TA的每日心情 | 开心 2021-1-4 22:53 |
---|
签到天数: 5 天 [LV.2]偶尔看看I
|
发表于 2021-1-4 02:18:57
|
显示全部楼层
- #!/usr/bin/env python3
3 o+ p, Y) c' R1 t6 i- K - # -*- coding: utf-8 -*-' o$ n Z9 v" n- X
- , G2 [$ l( V+ d e1 I, w6 e
- import os% K; \4 Y; Q8 x# `4 S. V7 V
- import requests
1 A0 ?7 ]; }' S6 @$ |- h+ c - import re
2 a9 z2 G0 j% ]7 a2 P# Q8 q - import time
# ~4 b4 ]/ \+ W) O5 F+ W4 D, q: F - import sqlite3
0 y- s; M, X; O* K7 O - + F7 B1 l, l7 s/ K. H& q
- from string import ascii_lowercase % @. v8 {# Y6 W# }. L
- : r3 m2 ^" _% W# M- }% `
- from pathlib import Path7 e: A+ m+ D3 h% o s& t9 \: _
- from urllib.parse import urljoin
+ n4 c; R4 {0 x, S( {+ t& z3 U - from html.parser import HTMLParser+ E( B# @) ~% o. D* X
- from bs4 import BeautifulSoup
% b# Y. @" ^- ?' e5 ?8 @ - $ [$ v2 P+ ?2 `5 u" `9 n
# Browse-index roots of the dictionaries this project knows about.  Only the
# Cambridge URLs are used by the code in this file.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"

cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

base_url = "https://dictionary.cambridge.org/search/direct/"
# Sent with every HTTP request issued by the scraper.
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}

# One shared connection for the whole script; the table is created on first
# run and reused afterwards, so an interrupted crawl can be resumed.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url   TEXT NOT NULL,
    html  TEXT NOT NULL,
    latex TEXT
);
""")
conn.commit()
# Whitespace that sits directly before punctuation (anything that is not a
# word character, a quote, or an opening bracket) is dropped.
_SPACE_BEFORE_PUNCTUATION = re.compile(r"""\s+([^\w'"({\[])""")

# Single-pass replacement table for characters that LaTeX treats specially.
# Using one pass means the braces of an inserted command such as
# ``\textbackslash{}`` are never escaped a second time.
_LATEX_ESCAPES = str.maketrans({
    "%": r"\%",
    "\\": r"\textbackslash{}",
    "$": r"\$",
    "#": r"\#",
    "&": r"\&",
    "{": r"\{",
    "}": r"\}",
    # ``\^`` alone is the circumflex *accent* command and would swallow the
    # following character, so the text-mode symbol is used instead.
    "^": r"\textasciicircum{}",
    "_": r"\_",
    "~": r"\textasciitilde{}",
})


def latexify(string):
    """Return *string* with whitespace tidied and LaTeX specials escaped.

    Runs of two or more whitespace characters collapse to one space,
    whitespace immediately before punctuation is removed, and each of
    ``% \\ $ # & { } ^ _ ~`` is replaced by its LaTeX text-mode escape.

    Args:
        string: plain text (``str``).

    Returns:
        The escaped text as a new ``str``.
    """
    collapsed = re.sub(r"\s{2,}", " ", string)
    tightened = _SPACE_BEFORE_PUNCTUATION.sub(r"\1", collapsed)
    return tightened.translate(_LATEX_ESCAPES)
def latexify_html(beautifulsoup_object):
    """Return the LaTeX-escaped visible text of a parsed HTML node.

    Newlines in the node's text are turned into spaces, whitespace runs are
    collapsed, and the result is stripped before being passed to
    ``latexify``.

    Args:
        beautifulsoup_object: an object exposing ``get_text()``; a plain
            string is also accepted and is escaped as-is.

    Returns:
        The escaped text as ``str``.
    """
    try:
        raw_text = beautifulsoup_object.get_text()
    except AttributeError:
        # Not a tag (no get_text method): treat the argument as plain text.
        return latexify(beautifulsoup_object)
    flattened = re.sub(r"\s{2,}", " ", raw_text.replace("\n", " ")).strip()
    return latexify(flattened)
class CambridgeDictionaryScraper:
    """Crawl the Cambridge Dictionary browse pages into the ``cambridge`` table.

    Every word page that is not yet stored is downloaded and inserted as an
    ``(entry, url, html)`` row through the module-level ``conn``.
    """

    url_set = set()  # URLs already stored; shared by all instances
    request_timeout = 30  # seconds; without a timeout a stalled server blocks forever

    def __init__(self):
        """Load the URLs already present in the database into ``url_set``."""
        self.url_set.update(row[0] for row in conn.execute("SELECT url FROM cambridge;"))
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        # find_child_entry already commits after every insert; this is a
        # final flush when the scraper object goes away.
        conn.commit()

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with the built-in HTML parser."""
        response = requests.get(url, headers=header, timeout=self.request_timeout)
        return BeautifulSoup(response.text, "html.parser")

    def get_word_page(self, url):
        """Return the HTML of the dictionary entry found at *url*.

        The entry container is looked up as ``div.entry``, then
        ``div.di-body``, then ``div#entryContent`` (beta words); when none is
        present the whole page is used.  ``<script>`` elements are removed
        before serialising.

        Args:
            url: address of a single word page.

        Returns:
            The selected markup as ``str``.
        """
        soup = self._fetch_soup(url)
        entry_tag = (
            soup.find("div", {"class": "entry"})
            or soup.find("div", {"class": "di-body"})
            or soup.find("div", {"id": "entryContent"})  # beta words
            or soup
        )
        for script_tag in entry_tag.find_all("script"):
            script_tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Crawl every index page linked from the browse root at *url*."""
        soup = self._fetch_soup(url)
        for li_tag in soup.select("li.lpr-10"):
            link = li_tag.a
            if link is None or "href" not in link.attrs:
                continue  # list item without a usable link
            child_url = urljoin(cambridge_url, link.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Walk one browse page, recursing into ranges and storing new words.

        A list item whose text contains ``...`` is treated as a link to a
        further browse page and followed recursively; any other item is a
        word page, which is downloaded and inserted unless its URL is
        already in ``url_set``.

        Args:
            url: address of a browse (index) page.
        """
        soup = self._fetch_soup(url)
        for li_tag in soup.select("li.t-i"):
            link = li_tag.a
            if link is None or "href" not in link.attrs:
                continue  # list item without a usable link
            child_url = urljoin(url, link.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                self.find_child_entry(child_url)
                continue
            if child_url in self.url_set:
                continue
            print(child_text + "\t" + child_url)
            try:
                conn.execute(
                    "INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                    (child_text, child_url, self.get_word_page(child_url)),
                )
            except sqlite3.IntegrityError:
                # ``entry`` is the primary key: the same headword text is
                # already stored under a different URL.  Skip it instead of
                # aborting the whole crawl.
                print("Skipped duplicate entry: " + child_text)
            else:
                conn.commit()
            self.url_set.add(child_url)
class CambridgeDictionaryExtractor:
    """Convert the stored HTML of one dictionary entry into LaTeX markup.

    Call ``extract()`` and then read ``result``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, entry, entry_html=""):
        """Store the headword (LaTeX-escaped) and the raw entry HTML.

        Args:
            entry: headword text.
            entry_html: HTML of the entry as saved by the scraper.
        """
        self.entry = latexify(entry)  # escaped headword
        self.entry_html = entry_html  # raw HTML to parse
        self.result = ""              # LaTeX output, filled by extract()

    def extract(self):
        """Parse ``entry_html`` and append the entry's LaTeX to ``result``.

        Idiom markup handled here::

            <div class="pr idiom-block">
                <div class="idiom-block"></div>
            </div>
        """
        soup = BeautifulSoup(self.entry_html, "html.parser")
        pieces = ["\\begin{entry}{" + self.entry + "}"]
        for pos_tag in soup.find_all("div", {"class": "entry-body__el"}):
            pieces.append(
                "\n\\begin{Partofspeech}\n"
                + self.process_part_of_speech(pos_tag)
                + "\n\\end{Partofspeech}"
            )
        outer_idiom_block = soup.find("div", {"class": "idiom-block"})
        if outer_idiom_block:
            for inner_idiom_block in outer_idiom_block.find_all("div", {"class": "idiom-block"}):
                pieces.append(
                    "\n\\begin{idiom}"
                    + self.process_idiom_block(inner_idiom_block)
                    + "\n\\end{idiom}"
                )
        pieces.append("\n\\end{entry}\n\n")
        self.result += "".join(pieces)

    def process_idiom_block(self, idiom_block):
        """Return the LaTeX for every sense inside an idiom block."""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if not idiom_body:
            return ""
        return "".join(
            self.process_sense_tag(sense_tag)
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"})
        )

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Return a ``smartvocabulary`` environment with one ``\\smart`` per ``<li>``."""
        items = "".join(
            "\\smart{" + latexify_html(li_tag) + "}\n"
            for li_tag in smart_vocabulary_tag.find_all("li")
        )
        return "\n\\begin{smartvocabulary}\n" + items + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Return the LaTeX for one part-of-speech section.

        Markup handled::

            <div class="entry-body__el">
                <div class="pos-header"></div>
                <div class="pos-body"></div>
                <div class="pr relativDiv"></div>
            <div>
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Return the LaTeX for every sense body inside a ``pv-block``.

        Markup handled::

            <div class="pv-block">
                <div class="di-title"></div>
                <span class="di-info"></span>
                <span class="pv-body dpv-body">
                    <div class="pr dsense dsense-noh">
                <span>
            <div>
        """
        # The attribute filter must be a dict; the original passed the set
        # literal {"class", "sense-body"} by mistake.
        return "".join(
            self.process_sense_body(item)
            for item in tag.find_all("div", {"class": "sense-body"})
        )

    def process_part_of_speech_header(self, header_tag):
        """Return grammar information followed by all pronunciations."""
        result = ""
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Return an ``\\entry{...}`` line for the headword in *title_tag*.

        Uses ``<span class="hw dhw">record</span>`` when present, otherwise
        the text of the whole title tag.
        """
        headword_tag = title_tag.find("span", {"class": "hw"})
        source_tag = headword_tag if headword_tag else title_tag
        return "\\entry{" + latexify_html(source_tag) + "}\n"

    def process_part_of_speech_grammar(self, posgram_tag):
        """Return ``\\pos`` and ``\\posgram`` lines for the tags that exist."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Return the IPA line and, when available, the audio URL line.

        The IPA macro is ``\\ipaus`` when the tag carries the class ``us``
        and ``\\ipauk`` otherwise.
        """
        result = ""
        is_us_pronunciation = "us" in (pronunciation_tag.get("class") or [])
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            macro = "\\ipaus{" if is_us_pronunciation else "\\ipauk{"
            result += "\n" + macro + latexify_html(ipa_tag) + "}"
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        audio_src = audio_tag.get("src") if audio_tag else None
        if audio_src:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_src)
            # NOTE(review): the macro name is spelled "pronuniation"; it is
            # kept byte-for-byte because the LaTeX preamble that defines it
            # is not visible here -- confirm before renaming.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    @staticmethod
    def _bracket_contents(text):
        """Return the stripped text between the first "(" and the next ")".

        Without "(" the text starts at the beginning; without ")" it runs to
        the end.
        """
        start = text.find("(") + 1  # find() == -1 when absent, giving 0
        end = text.find(")", start)
        if end == -1:
            end = len(text)
        return text[start:end].strip()

    def process_sense_head(self, tag):
        """Return ``\\shortmeaning{...}`` built from the sense heading.

        The short meaning is the bracketed part of the heading text; when
        the heading has no brackets the whole text is used.
        """
        return "\n\\shortmeaning{" + self._bracket_contents(latexify_html(tag)) + "}"

    def get_definition(self, tag):
        """Return the concatenated text of the definition-info and definition tags."""
        result = ""
        # NOTE(review): "def-into" may be a typo for "def-info" (compare the
        # loop variable name) -- confirm against the page markup; the
        # selector is kept unchanged so the output does not change.
        for def_info_tag in tag.select("span.def-into"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        """Return English meaning, first translation and examples of a definition block."""
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        translation_tags = tag.select("span.trans")
        if translation_tags:
            # Only the first translation is emitted.
            result += "\n\\meaningcn{" + latexify_html(translation_tags[0]) + "}"
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Return a ``phrase`` environment titled with the phrase text.

        Markup handled::

            <div class="phrase-head dphrase_h">...</div>
            <div class="phrase-body dphrase_b">...</div>
            <div class="bb hax">...</div>

        Returns an empty string when the block has no ``span.phrase-title``.
        """
        title_tags = phrase_block_tag.select("span.phrase-title")
        if not title_tags:
            return ""
        # The title is escaped like all other text, and the environment is
        # closed with the same name it was opened with.
        return "\\begin{phrase}{" + latexify_html(title_tags[0]) + "}" + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Return the LaTeX for all definition blocks, then all phrase blocks.

        Markup handled::

            <div class="pr phrase-block dphrase-block">...</div>
            <div class="def-block ddef_block">...</div>
            <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Return the LaTeX for one sense: heading, bodies, smart vocabulary.

        Markup handled::

            <h3 class="dsense_h">...</h3>
            <div class="sense-body dsense_b">...</div>
            <div class="smartt daccord">...</div>   (Smart Vocabulary)
            <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Return the LaTeX for every ``div.dsense`` inside the body.

        Markup handled::

            <div class="pr dsense">...</div>
            <div class="pr dsense">...</div>
        """
        return "".join(
            self.process_sense_tag(sense_tag)
            for sense_tag in body_tag.select("div.dsense")
        )
if __name__ == "__main__":
    # Step 1: crawl; pages whose URL is already stored are not downloaded again.
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

    # Step 2: render every stored entry to LaTeX.  Pieces are collected in a
    # list and joined once instead of growing one string with "+=".
    rendered_entries = []
    for entry, _url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        rendered_entries.append(record.result + "\n")
    string = "".join(rendered_entries)

    # Step 3: put a \section heading in front of the first entry that starts
    # with each lowercase letter (only the first occurrence is replaced).
    for char in ascii_lowercase:
        string = string.replace(
            "\\begin{entry}{" + char,
            "\\section{" + char + "}\n\n\\begin{entry}{" + char,
            1,
        )

    # Step 4: write the document.  The original passed the text through
    # replace("", ""), which returns the string unchanged, so it is omitted.
    with open("./final.tex", "w", encoding="utf-8") as f:
        f.write(string)
复制代码 |
|