TA的每日心情 | 开心 2021-1-4 22:53 |
---|
签到天数: 5 天 [LV.2]偶尔看看I
|
发表于 2021-1-4 02:18:57
|
显示全部楼层
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Standard library
import os
import re
import sqlite3
import time
from html.parser import HTMLParser
from pathlib import Path
from string import ascii_lowercase
from urllib.parse import urljoin

# Third-party
import requests
from bs4 import BeautifulSoup
- 6 @- `5 J/ G) h7 V$ r$ A
# Browse-index roots of the dictionaries this project knows about.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"

# English -> Simplified-Chinese browse index (the one actually scraped below).
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

# Direct-search endpoint plus the request boilerplate used for every fetch.
base_url = "https://dictionary.cambridge.org/search/direct/"
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}
- 6 |8 `$ c2 c9 z0 B" L
# Module-level sqlite handle; every scraped entry is cached here so that a
# restarted run can pick up where it stopped.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")

# One row per headword: its browse url, the raw html snapshot, and (later)
# the generated LaTeX.  "latex" stays NULL until an extractor fills it in.
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
          UNIQUE
          NOT NULL,
    url TEXT NOT NULL,
    html TEXT NOT NULL,
    latex TEXT
);
""")

conn.commit()
( T9 H8 O/ S' n* g6 q - ' V% u7 P3 P. H" d p. \
def latexify(string):
    """Collapse whitespace in *string* and escape LaTeX special characters.

    Runs of two or more whitespace characters become a single space,
    whitespace immediately before punctuation (anything that is not a word
    character, apostrophe, double quote, or opening bracket) is dropped,
    and the ten TeX special characters are escaped so the result can be
    embedded verbatim in a LaTeX document.
    """
    # Each TeX special character and its escaped replacement.
    escapes = {
        "%": r"\%",
        "\\": r"\textbackslash{}",
        "$": r"\$",
        "#": r"\#",
        "&": r"\&",
        "{": r"\{",
        "}": r"\}",
        "^": r"\^",
        "_": r"\_",
        # BUG FIX: was the broken literal "r\textasciitilde{}" — a stray
        # leading "r" followed by a TAB (the "\t" escape).  Now a raw string.
        "~": r"\textasciitilde{}",
    }
    trimmed_string = re.sub(r"\s{2,}", " ", string)
    # Drop whitespace that directly precedes punctuation.
    normalized = re.sub(r"""\s+([^\w'"({\[])""", r"\1", trimmed_string)
    return "".join(escapes.get(char, char) for char in normalized)
; t. l( q* @0 O% w9 }7 S+ M
def latexify_html(beautifulsoup_object):
    """Flatten a BeautifulSoup node to escaped LaTeX text.

    Accepts either a bs4 tag (its text is extracted and whitespace
    normalised) or a plain string (passed straight through to latexify()).
    """
    try:
        text = beautifulsoup_object.get_text()
    # BUG FIX: was a bare ``except`` that swallowed every error; the only
    # expected failure is a plain-string argument lacking get_text().
    except AttributeError:
        return latexify(beautifulsoup_object)
    return latexify(re.sub(r"\s{2,}", " ", text.replace("\n", " ")).strip())
/ q* G( W! C- T2 Y0 K* U. Z; r- + K- W2 H8 z, [
class CambridgeDictionaryScraper:
    """Scraper for Cambridge Dictionary browse pages.

    Walks an alphabetical browse index, follows every word link, and stores
    (entry, url, html) rows through the module-level sqlite handle ``conn``.
    """

    # Shared by all instances: urls already stored in the database.
    url_set = set()

    def __init__(self):
        # Seed the de-duplication set from rows already downloaded so a
        # restarted run continues where it left off.
        for item in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(item[0])
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        # Best-effort flush of any rows not yet committed.
        conn.commit()

    def get_word_page(self, url):
        """Fetch *url* and return the html of the dictionary-entry body only."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        entry_tag = bs_obj.find("div", {"class": "entry"})
        if not entry_tag:
            entry_tag = bs_obj.find("div", {"class": "di-body"})
        if not entry_tag:
            # Beta words live under a different container.
            entry_tag = bs_obj.find("div", {"id": "entryContent"})
        if not entry_tag:
            # Last resort: keep the whole page.
            entry_tag = bs_obj
        # BUG FIX: the original repeated the ``if not entry_tag`` fallback a
        # second time after already assigning bs_obj — dead code, removed.
        for tag in entry_tag.find_all("script"):
            tag.extract()  # scripts are useless in a stored snapshot
        return str(entry_tag)

    def start(self, url):
        """Entry point: iterate the per-letter index pages listed at *url*."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.lpr-10"):
            child_url = urljoin(cambridge_url, li_tag.a.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Recursively expand "..." range pages; store each leaf word page."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.t-i"):
            child_url = urljoin(url, li_tag.a.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                # An ellipsis marks an intermediate range page, not a word.
                self.find_child_entry(child_url)
            else:
                if child_url in self.url_set:
                    continue  # already downloaded in a previous run
                print(child_text + "\t" + child_url)
                # NOTE(review): ``entry`` is the PRIMARY KEY, so two distinct
                # urls sharing one headword text would raise IntegrityError —
                # confirm against real data before a long run.
                conn.execute("INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                             (child_text, child_url, self.get_word_page(child_url)))
                conn.commit()
                self.url_set.add(child_url)
- , |$ _9 e; Z" k% _9 D( B& z
class CambridgeDictionaryExtractor():
    """Converts one stored Cambridge entry (html snapshot) into LaTeX.

    Usage: construct with the headword and its html, call ``extract()``,
    then read the LaTeX fragment from ``self.result``.
    """

    def __init__(self, entry, entry_html=""):
        self.entry = latexify(entry)   # headword, LaTeX-escaped
        self.entry_html = entry_html   # raw html snapshot from the db
        self.result = ""               # accumulated LaTeX output

    def extract(self):
        """Populate ``self.result`` from ``self.entry_html``.

        Expected html shape::

            <div class="pr idiom-block">
              <div class="idiom-block"></div>
            </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")
        outer_idiom_block = bs_obj.find("div", {"class": "idiom-block"})
        if outer_idiom_block:
            for idiom_block in outer_idiom_block.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(idiom_block)
                                + "\n\\end{idiom}")
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Render every sense found inside one idiom block."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Render a "SMART Vocabulary" related-words list."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Render one part-of-speech section.

        Expected html shape::

            <div class="entry-body__el">
              <div class="pos-header"></div>
              <div class="pos-body"></div>
              <div class="pr relativDiv"></div>
            </div>
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Render a phrasal-verb block.

        Expected html shape::

            <div class="pv-block">
              <div class="di-title"></div>
              <span class="di-info"></span>
              <span class="pv-body dpv-body">
                <div class="pr dsense dsense-noh">
            </div>
        """
        result = ""
        # BUG FIX: the attrs argument was the *set* {"class", "sense-body"};
        # it must be a dict mapping attribute name to value.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """Render grammar info and pronunciations from a pos-header."""
        result = ""
        # title_tag = header_tag.find("div", {"class": "di-title"})
        # if title_tag:
        #     result += process_header_title(title_tag)
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Render the headword, e.g. <span class="hw dhw">record</span>."""
        headword_tag = title_tag.find("span", {"class": "hw"})
        # Fall back to the whole title when no dedicated headword span exists.
        source = headword_tag if headword_tag else title_tag
        return "\\entry{" + latexify_html(source) + "}\n"

    def process_part_of_speech_grammar(self, posgram_tag):
        """Render part-of-speech and grammar-code spans."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Render IPA (UK vs US chosen by the tag's class) and audio url."""
        is_us_pronunciation = "us" in pronunciation_tag.attrs["class"]
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            if is_us_pronunciation:
                result += "\n\\ipaus{" + latexify_html(ipa_tag) + "}"
            else:
                result += "\n\\ipauk{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # BUG FIX: was "\n\pronuniation{...}" — "\p" is not a valid string
            # escape and only worked by accident; now an explicit raw string.
            # NOTE(review): the macro name "pronuniation" is misspelled, but
            # it is kept as-is because the LaTeX preamble must define the
            # exact same name — fix both together if at all.
            result += "\n" + r"\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Render the short "guide word" of a sense, i.e. the text in (...)."""
        text = latexify_html(tag)
        # BUG FIX: the fallback index for a missing "(" was 0, so the slice
        # below (index + 1) silently dropped the first character; -1 keeps it.
        left_bracket_index = text.index("(") if "(" in text else -1
        right_bracket_index = text.index(")") if ")" in text else len(text)
        return ("\n\\shortmeaning{"
                + text[left_bracket_index + 1: right_bracket_index].strip()
                + "}")

    def get_definition(self, tag):
        """Concatenate definition-info and definition text inside *tag*."""
        result = ""
        # BUG FIX: selector was "span.def-into", a typo for the "def-info"
        # class used by Cambridge markup.
        for def_info_tag in tag.select("span.def-info"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        """Render one definition block: English meaning, Chinese translation,
        and example sentences."""
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        # BUG FIX: was a bare try/except around select(...)[0] that swallowed
        # every error; an explicit emptiness test says what is really meant.
        trans_tags = tag.select("span.trans")
        if trans_tags:
            result += "\n\\meaningcn{" + latexify_html(trans_tags[0]) + "}"
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Render a phrase block's title.

        Expected html shape::

            <div class="phrase-head dphrase_h">...</div>
            <div class="phrase-body dphrase_b">...</div>
            <div class="bb hax">...</div>
        """
        result = "\\begin{phrase}{"
        result += phrase_block_tag.select("span.phrase-title")[0].get_text().strip() + "}"
        # BUG FIX: the closing environment was misspelled "\end{pharse}",
        # which would break the generated LaTeX.
        # NOTE(review): the phrase body is not rendered yet — only the title.
        return result + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Render all definition blocks and phrase blocks of one sense body.

        Expected html shape::

            <div class="pr phrase-block dphrase-block">...</div>
            <div class="def-block ddef_block">...</div>
            <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        # BUG FIX: selector was "div.pharse-block" (typo), so phrase blocks
        # were never rendered at all.
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Render one sense: head, bodies, and SMART Vocabulary boxes.

        Expected html shape::

            <h3 class="dsense_h">...</h3>
            <div class="sense-body dsense_b">...</div>
            <div class="smartt daccord">...</div>   <!-- Smart Vocabulary -->
            <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Render every sense (<div class="pr dsense">) inside a pos-body."""
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result
4 e/ |/ ?* i& w4 l: Y+ { - 3 E# u) g: e o) @ g& T7 y
- ( l8 t$ s# B8 `! z$ i$ O, |. Q
if __name__ == "__main__":
    # 1. Scrape (or resume scraping) the English->Chinese browse index.
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

    # 2. Convert every stored entry to LaTeX.  Collect the fragments in a
    #    list and join once — the original quadratic ``string +=`` removed.
    parts = []
    for entry, url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        parts.append(record.result + "\n")
    string = "".join(parts)

    # 3. Insert a \section heading before the first entry of each letter.
    #    (BUG FIX: the pointless bare try/except around this loop removed —
    #    str.replace cannot raise here.)
    for char in ascii_lowercase:
        string = string.replace("\\begin{entry}{" + char,
                                "\\section{" + char + "}\n\n\\begin{entry}{" + char,
                                1)

    # NOTE(review): the original ended with string.replace("", "") — a no-op,
    # probably a garbled attempt to strip some control character; dropped.
    with open("./final.tex", "w", encoding="utf-8") as f:
        f.write(string)
-
复制代码 |
|