TA的每日心情 | 开心 2021-1-4 22:53 |
---|
签到天数: 5 天 [LV.2]偶尔看看I
|
发表于 2021-1-4 02:18:57
|
显示全部楼层
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape the Cambridge English-Chinese (Simplified) dictionary into SQLite.

Downloaded entry HTML is stored in a local SQLite database and later
converted to LaTeX by CambridgeDictionaryExtractor.
"""

import os
import re
import sqlite3
import time

from html.parser import HTMLParser
from pathlib import Path
from string import ascii_lowercase
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Browse-page roots for several dictionaries; only the Cambridge ones are
# actually used below, the others are kept for reference.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"

cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

base_url = "https://dictionary.cambridge.org/search/direct/"
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}

# Single shared module-level connection; the scraper commits after each insert.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")

conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
          UNIQUE
          NOT NULL,
    url TEXT NOT NULL,
    html TEXT NOT NULL,
    latex TEXT
);
""")
conn.commit()
def latexify(string):
    """Collapse whitespace in *string* and escape LaTeX special characters.

    Runs of two or more whitespace characters are squeezed to a single
    space, and whitespace preceding punctuation (anything that is not a
    word character, quote, or opening bracket) is dropped before the
    character-by-character escaping pass.
    """
    # Escape table for LaTeX special characters.
    # Fixed: the "~" entry was written "r\textasciitilde{}" with the raw
    # prefix inside the quotes, so it appended a literal "r", a TAB ("\t"),
    # and "extasciitilde{}" instead of the tilde macro.
    escapes = {
        "%": r"\%",
        "\\": r"\textbackslash{}",
        "$": r"\$",
        "#": r"\#",
        "&": r"\&",
        "{": r"\{",
        "}": r"\}",
        "^": r"\^",
        "_": r"\_",
        "~": r"\textasciitilde{}",
    }
    result = ""
    trimmed_string = re.sub(r"\s{2,}", " ", string)
    for char in re.sub(r"\s+([^\w'\"({[])", r"\1", trimmed_string):
        result += escapes.get(char, char)
    return result
def latexify_html(beautifulsoup_object):
    """Flatten a BeautifulSoup node into one line of LaTeX-escaped text.

    Falls back to escaping the argument directly when it is not a tag
    (i.e. it has no .get_text() method and is already a plain string).
    """
    try:
        flat_text = beautifulsoup_object.get_text().replace("\n", " ")
        return latexify(re.sub(r"\s{2,}", " ", flat_text).strip())
    except AttributeError:
        # Narrowed from a bare "except:": the only expected failure mode is
        # a non-tag argument lacking .get_text().
        return latexify(beautifulsoup_object)
class CambridgeDictionaryScraper:
    """Scraper for Cambridge Dictionary browse pages.

    Walks the alphabetical browse index, recursing into intermediate
    listing pages, and stores each word's entry HTML in the module-level
    sqlite connection ``conn``.
    """

    url_set = set()  # Shared by all instances: URLs already stored in the DB.

    def __init__(self):
        # Seed the de-duplication set from rows already downloaded.
        for item in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(item[0])
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        # Best-effort flush of any pending inserts.
        conn.commit()

    def get_word_page(self, url):
        """Download *url* and return the HTML of its dictionary-entry body.

        Tries progressively broader containers; "beta" words use a
        different layout (#entryContent). Falls back to the whole page.
        """
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        entry_tag = bs_obj.find("div", {"class": "entry"})
        if not entry_tag:
            entry_tag = bs_obj.find("div", {"class": "di-body"})
        if not entry_tag:
            ## Beta Words
            entry_tag = bs_obj.find("div", {"id": "entryContent"})
        if not entry_tag:
            # Last resort: keep the whole page. (The original repeated this
            # fallback twice; the duplicate was dead code and is removed.)
            entry_tag = bs_obj
        # Drop <script> tags so only markup is stored.
        for tag in entry_tag.find_all("script"):
            tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Visit the top-level browse page and follow each letter link."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.lpr-10"):
            child_url = urljoin(cambridge_url, li_tag.a.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Recursively collect word entries under a browse listing page."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.t-i"):
            child_url = urljoin(url, li_tag.a.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                # "word..." items are intermediate listing pages; recurse.
                self.find_child_entry(child_url)
            else:
                if child_url in self.url_set:
                    continue
                print(child_text + "\t" + child_url)
                conn.execute(
                    "INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                    (child_text, child_url, self.get_word_page(child_url)),
                )
                conn.commit()
                self.url_set.add(child_url)
class CambridgeDictionaryExtractor():
    """Convert one stored Cambridge entry's HTML into LaTeX.

    Call extract() after construction; the generated LaTeX accumulates in
    ``self.result``.
    """

    def __init__(self, entry, entry_html=""):
        self.entry = latexify(entry)   # LaTeX-escaped headword
        self.entry_html = entry_html   # raw entry HTML from the database
        self.result = ""               # LaTeX output, filled by extract()

    def extract(self):
        """
        <div class="pr idiom-block">
        <div class="idiom-block"></div>
        </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")
        idiom_block = bs_obj.find("div", {"class": "idiom-block"})
        if idiom_block:
            for inner_block in idiom_block.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(inner_block)
                                + "\n\\end{idiom}")
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """LaTeX for every sense inside an idiom block's idiom-body."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Wrap each related-word <li> in a \\smart{...} macro."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """
        <div class="entry-body__el">
        <div class="pos-header"></div>
        <div class="pos-body"></div>
        <div class="pr relativDiv"></div>
        <div>
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """
        <div class="pv-block">
        <div class="di-title"></div>
        <span clss="di-info"></span>
        <span class="pv-body dpv-body">
        <div class="pr dsense dsense-noh">
        <span>
        <div>
        """
        result = ""
        # Fixed: the attrs argument was the *set* {"class", "sense-body"}
        # (comma instead of colon), not a dict, so it never matched.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """LaTeX for the pos-header: grammar label plus pronunciations."""
        result = ""
        # title_tag = header_tag.find("div", {"class": "di-title"})
        # if title_tag:
        #     result += process_header_title(title_tag)
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        ## <span class="hw dhw">record</span>
        result = ""
        headword_tag = title_tag.find("span", {"class": "hw"})
        if headword_tag:
            result += "\\entry{" + latexify_html(headword_tag) + "}\n"
        else:
            result += "\\entry{" + latexify_html(title_tag) + "}\n"
        return result

    def process_part_of_speech_grammar(self, posgram_tag):
        """Emit \\pos{...} and \\posgram{...} from a posgram span."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Emit \\ipauk/\\ipaus for the IPA span and a macro for the audio URL."""
        is_us_pronunciation = "us" in pronunciation_tag.attrs["class"]
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            if is_us_pronunciation:
                result += "\n\\ipaus{" + latexify_html(ipa_tag) + "}"
            else:
                result += "\n\\ipauk{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # NOTE(review): macro name "\pronuniation" is kept byte-for-byte
            # from the original output; it looks like a typo for
            # "\pronunciation" -- confirm against the LaTeX preamble before
            # renaming.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Extract the text between parentheses as \\shortmeaning{...}."""
        text = latexify_html(tag)
        if "(" in text:
            left_bracket_index = text.index("(")
        else:
            left_bracket_index = 0
        if ")" in text:
            right_bracket_index = text.index(")")
        else:
            right_bracket_index = len(text)
        return "\n\\shortmeaning{" + text[left_bracket_index + 1: right_bracket_index].strip() + "}"

    def get_definition(self, tag):
        """Concatenate the English definition text of one ddef_h block."""
        result = ""
        # NOTE(review): "def-into" may be a typo for "def-info"; kept as-is
        # because it cannot be verified from this file alone.
        for def_info_tag in tag.select("span.def-into"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        """LaTeX for one definition block: meaning (en/cn) plus examples."""
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        # Replaces the original bare try/except around select(...)[0]:
        # check the result list instead of swallowing the IndexError.
        trans_tags = tag.select("span.trans")
        if trans_tags:
            result += "\n\\meaningcn{" + latexify_html(trans_tags[0]) + "}"
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """
        <div class="phrase-head dphrase_h">...</div>
        <div class="phrase-body dphrase_b">...</div>
        <div class="bb hax">...</div>
        """
        result = "\\begin{phrase}{"
        result += phrase_block_tag.select("span.phrase-title")[0].get_text().strip() + "}"
        # Fixed: the closing environment was misspelled "\end{pharse}",
        # which left "\begin{phrase}" unmatched in the generated LaTeX.
        return result + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """
        <div class="pr phrase-block dphrase-block">...</div>
        <div class="def-block ddef_block">...</div>
        <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        # Fixed selector typo "div.pharse-block" (cf. the class name in the
        # docstring above); the misspelling never matched, so phrase blocks
        # were silently dropped.
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """
        <h3 class="dsense_h">...</h3>
        <div class="sense-body dsense_b">...</div>
        <div class="smartt daccord">...</div> # Smart Vocabulary
        <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """
        <div class="pr dsense">...</div>
        <div class="pr dsense">...</div>
        """
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result
if __name__ == "__main__":
    # Phase 1: crawl the browse index and fill the database.
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

    # Phase 2: convert every stored entry to LaTeX.
    # Accumulate in a list and join once (the original used quadratic
    # string += inside the loop).
    parts = []
    for entry, url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        parts.append(record.result + "\n")
    string = "".join(parts)

    # Insert a \section heading before the first entry of each letter.
    # The original wrapped this in try/except: pass, but str.replace cannot
    # raise here, so the guard is dropped.
    for char in ascii_lowercase:
        string = string.replace(
            "\\begin{entry}{" + char,
            "\\section{" + char + "}\n\n\\begin{entry}{" + char,
            1,
        )

    with open("./final.tex", "w", encoding="utf-8") as f:
        # NOTE(review): the original called string.replace("", "") here -- a
        # no-op whose first argument was probably a special character lost in
        # transit (e.g. a zero-width space); confirm against the source post.
        f.write(string)
复制代码 |
|