有没有合适的PYTHON抓字典的案例可参考？

stiggg · 发表于 2020-9-13 15:19:03

懂点 Python，想学学前人是怎么爬字典的。目前只看到 BT4BAIDU 大大的开源代码，写得很典范，可是代码已经是四五年前的了，网站似乎也改版了，其中具体细节的作用比较难揣摩。不知道还有没有其他比较好的案例可以参考学习？

毛小驴 · 发表于 2020-9-13 18:58:34

顶起

你去哪里 · 发表于 2020-9-13 22:19:26

我也想知道这个。

你去哪里 · 发表于 2020-9-13 22:19:38

我也想知道这个。

scirem · 发表于 2021-1-4 02:18:57

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape Cambridge Dictionary entry pages into SQLite and render them as LaTeX.

Importing this module opens (and, if needed, creates) the SQLite database
file and makes sure the ``cambridge`` table exists.
"""

import os
import re
import sqlite3
import time
from html.parser import HTMLParser
from pathlib import Path
from string import ascii_lowercase
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Browse-index roots of several online dictionaries.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

# Direct-search endpoint and its query parameters.
base_url = "https://dictionary.cambridge.org/search/direct/"
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}

# Headers sent with every HTTP request.
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}

# One shared connection; every row holds one entry and its raw HTML.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url   TEXT NOT NULL,
    html  TEXT NOT NULL,
    latex TEXT
);
""")
conn.commit()


# Replacement for every character that is special to LaTeX.  ``\``, ``^`` and
# ``~`` use the text-mode macros because ``\^`` and ``\~`` are accent commands
# that would swallow the following character.
_LATEX_ESCAPES = str.maketrans({
    "%": r"\%",
    "\\": r"\textbackslash{}",
    "$": r"\$",
    "#": r"\#",
    "&": r"\&",
    "{": r"\{",
    "}": r"\}",
    "^": r"\textasciicircum{}",
    "_": r"\_",
    "~": r"\textasciitilde{}",
})


def latexify(string):
    """Return *string* with whitespace tidied and LaTeX special characters escaped.

    Runs of two or more whitespace characters collapse to a single space, and
    whitespace is removed in front of any character that is not a word
    character, a quote (``'`` or ``"``) or an opening bracket (``(``, ``{``,
    ``[``) -- e.g. ``"wait , what ?"`` becomes ``"wait, what?"``.

    Args:
        string: Plain text to be embedded in a LaTeX document.

    Returns:
        The escaped text.
    """
    collapsed = re.sub(r"\s{2,}", " ", string)
    tightened = re.sub(r"""\s+([^\w'"({\[])""", r"\1", collapsed)
    # translate() is a single pass, so the braces inside an inserted
    # "\textbackslash{}" are never escaped a second time.
    return tightened.translate(_LATEX_ESCAPES)


def latexify_html(beautifulsoup_object):
    """Return the LaTeX-escaped text of a parsed HTML node (or of a plain string).

    Args:
        beautifulsoup_object: An object exposing ``get_text()``; anything
            without that method is passed to ``latexify`` unchanged.

    Returns:
        The escaped text.  For nodes, newlines are turned into spaces, runs of
        whitespace are collapsed and the ends are stripped before escaping.
    """
    try:
        text = beautifulsoup_object.get_text()
    except AttributeError:
        # Not a parsed node (e.g. already a str): escape it as it is.
        return latexify(beautifulsoup_object)
    return latexify(re.sub(r"\s{2,}", " ", text.replace("\n", " ")).strip())


class CambridgeDictionaryScraper:
    """Crawl Cambridge Dictionary browse pages and store entry HTML in SQLite.

    URLs already present in the ``cambridge`` table are loaded at construction
    time and skipped, so an interrupted crawl can be resumed by running it
    again.
    """

    url_set = set()  # URLs already stored; shared by all instances
    request_timeout = 30  # seconds; without a timeout a stalled request hangs forever

    def __init__(self):
        self.url_set.update(row[0] for row in conn.execute("SELECT url FROM cambridge;"))
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        conn.commit()

    def _get_soup(self, url, check_status=False):
        """Fetch *url* and parse it; raise ``requests.HTTPError`` on 4xx/5xx if asked."""
        r = requests.get(url, headers=header, timeout=self.request_timeout)
        if check_status:
            r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")

    @staticmethod
    def _link_target(li_tag):
        """Return the ``href`` of the first ``<a>`` inside *li_tag*, or ``None``."""
        anchor = li_tag.find("a")
        if anchor is None:
            return None
        return anchor.get("href")

    def get_word_page(self, url):
        """Download one entry page and return the HTML of its entry container.

        The container is the first match of ``div.entry``, ``div.di-body`` or
        ``div#entryContent`` (beta words); if none is found the whole document
        is used.  ``<script>`` elements are removed before serialising.

        Args:
            url: Address of the entry page.

        Returns:
            The container as an HTML string.

        Raises:
            requests.RequestException: On network failure, timeout or an HTTP
                error status, so that an error page is never stored as an entry.
        """
        bs_obj = self._get_soup(url, check_status=True)
        entry_tag = (
            bs_obj.find("div", {"class": "entry"})
            or bs_obj.find("div", {"class": "di-body"})
            or bs_obj.find("div", {"id": "entryContent"})  # beta words
            or bs_obj
        )
        for tag in entry_tag.find_all("script"):
            tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Crawl every index page linked from the ``li.lpr-10`` items of *url*.

        Args:
            url: Address of the top-level browse page.
        """
        bs_obj = self._get_soup(url)
        for li_tag in bs_obj.select("li.lpr-10"):
            href = self._link_target(li_tag)
            if href is None:
                continue
            # Resolve against the page that was fetched, not a fixed base URL.
            child_url = urljoin(url, href)
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Store every entry linked from the ``li.t-i`` items of *url*.

        An item whose text contains ``...`` is treated as a further index page
        and crawled recursively; any other item is an entry whose page is
        downloaded and inserted into the ``cambridge`` table, unless its URL
        is already known.

        Args:
            url: Address of a browse index page.
        """
        bs_obj = self._get_soup(url)
        for li_tag in bs_obj.select("li.t-i"):
            href = self._link_target(li_tag)
            if href is None:
                continue
            child_url = urljoin(url, href).strip()
            child_text = li_tag.get_text().strip()

            if "..." in child_text:
                self.find_child_entry(child_url)
                continue
            if child_url in self.url_set:
                continue

            print(child_text + "\t" + child_url)
            try:
                page_html = self.get_word_page(child_url)
            except requests.RequestException as error:
                # Leave the URL out of url_set so the next run retries it.
                print("Download failed, skipping: " + child_url + " (" + str(error) + ")")
                continue
            try:
                conn.execute("INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                             (child_text, child_url, page_html))
            except sqlite3.IntegrityError:
                # "entry" is the primary key: the same headword was already
                # stored under a different URL.
                print("Entry already stored, skipping: " + child_text)
            else:
                conn.commit()
            self.url_set.add(child_url)


class CambridgeDictionaryExtractor():
    """Convert the stored HTML of one dictionary entry into LaTeX markup.

    Call ``extract()`` and read the generated markup from ``result``.
    """

    def __init__(self, entry, entry_html = ""):
        """Remember the headword (LaTeX-escaped) and the raw entry HTML.

        Args:
            entry: Headword text.
            entry_html: HTML of the entry as stored in the database.
        """
        self.entry = latexify(entry)
        self.entry_html = entry_html
        self.result = ""

    def extract(self):
        """Append the LaTeX ``entry`` environment for this entry to ``self.result``.

        Every ``div.entry-body__el`` becomes a ``Partofspeech`` environment.
        Idioms are read from this structure:

            <div class="pr idiom-block">
                <div class="idiom-block"></div>
            </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")

        outer_idiom_block = bs_obj.find("div", {"class": "idiom-block"})
        if outer_idiom_block:
            for idiom_block in outer_idiom_block.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(idiom_block)
                                + "\n\\end{idiom}")
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Return the markup of every ``div.dsense`` inside the block's ``span.idiom-body``."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Return a ``smartvocabulary`` environment with one ``\\smart`` per ``<li>``."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Return the markup of one part-of-speech section.

        Expected structure:

            <div class="entry-body__el">
                <div class="pos-header"></div>
                <div class="pos-body"></div>
                <div class="pr relativDiv"></div>
            <div>

        The header, the body and an optional ``div.pv-block`` are rendered in
        that order; missing parts are skipped.
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)

        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)

        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Return the markup of every ``div.sense-body`` inside a ``pv-block``.

        Expected structure:

            <div class="pv-block">
                <div class="di-title"></div>
                <span clss="di-info"></span>
                <span class="pv-body dpv-body">
                    <div class="pr dsense dsense-noh">
                <span>
            <div>
        """
        result = ""
        # The attribute filter must be a dict; the set literal used before only
        # worked by accident.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """Return grammar and pronunciation markup found in a ``pos-header``."""
        result = ""
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Return an ``\\entry{...}`` line for the headword of *title_tag*.

        The headword is read from ``<span class="hw dhw">record</span>``; when
        that span is missing the text of the whole title is used.
        """
        headword_tag = title_tag.find("span", {"class": "hw"})
        if headword_tag:
            return "\\entry{" + latexify_html(headword_tag) + "}\n"
        return "\\entry{" + latexify_html(title_tag) + "}\n"

    def process_part_of_speech_grammar(self, posgram_tag):
        """Return ``\\pos`` (from ``span.pos``) and ``\\posgram`` (from ``span.gc``)."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Return IPA and audio markup for one pronunciation element.

        The IPA goes into ``\\ipaus`` when the element carries the CSS class
        ``us`` and into ``\\ipauk`` otherwise.  The URL of the
        ``audio/mpeg`` source, if any, is made absolute.
        """
        is_us_pronunciation = "us" in pronunciation_tag.get("class", [])
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            macro = "\\ipaus{" if is_us_pronunciation else "\\ipauk{"
            result += "\n" + macro + latexify_html(ipa_tag) + "}"
        if audio_tag and audio_tag.get("src"):
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.get("src"))
            # NOTE(review): the macro name is spelled "pronuniation"; it is kept
            # byte-for-byte because the LaTeX preamble defining it is not
            # visible here -- confirm before renaming.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Return ``\\shortmeaning{...}`` with the bracketed part of a sense heading.

        The text between the first ``(`` and the next ``)`` is used.  Without a
        ``(`` the text is taken from its start, without a ``)`` up to its end.
        """
        text = latexify_html(tag)
        # find() yields -1 when there is no "(", so start is 0 and the first
        # character is no longer dropped.
        start = text.find("(") + 1
        end = text.find(")", start)
        if end == -1:
            end = len(text)
        return "\n\\shortmeaning{" + text[start:end].strip() + "}"

    def get_definition(self, tag):
        """Return the text of all ``span.def-info`` then all ``div.def`` elements.

        Non-empty pieces are joined with a single space.
        """
        pieces = [latexify_html(def_info_tag) for def_info_tag in tag.select("span.def-info")]
        pieces += [latexify_html(def_tag) for def_tag in tag.select("div.def")]
        return " ".join(piece for piece in pieces if piece)

    def process_def_block(self, tag):
        """Return English meaning, Chinese translation and examples of a def-block.

        ``div.ddef_h`` yields ``\\meaningen``, the first ``span.trans`` yields
        ``\\meaningcn`` and every ``span.eg`` yields one ``\\example``.
        """
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"

        def_trans_tag = tag.select_one("span.trans")
        if def_trans_tag:
            result += "\n\\meaningcn{" + latexify_html(def_trans_tag) + "}"

        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Return a ``phrase`` environment holding only the phrase title.

        Expected structure:

            <div class="phrase-head dphrase_h">...</div>
            <div class="phrase-body dphrase_b">...</div>
            <div class="bb hax">...</div>

        The title comes from the first ``span.phrase-title``; it is empty when
        that element is missing.
        """
        title_tag = phrase_block_tag.select_one("span.phrase-title")
        title = latexify_html(title_tag) if title_tag else ""
        return "\\begin{phrase}{" + title + "}" + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Return the markup of all def-blocks, then all phrase-blocks, in a sense body.

        Expected structure:

            <div class="pr phrase-block dphrase-block">...</div>
            <div class="def-block ddef_block">...</div>
            <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Return heading, bodies and smart vocabulary of one sense.

        Expected structure:

            <h3 class="dsense_h">...</h3>
            <div class="sense-body dsense_b">...</div>
            <div class="smartt daccord">...</div>    # Smart Vocabulary
            <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Return the markup of every ``div.dsense`` inside a ``pos-body``.

        Expected structure:

            <div class="pr dsense">...</div>
            <div class="pr dsense">...</div>
        """
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result


def main():
    """Crawl the English-Chinese dictionary, then write every entry to ``final.tex``.

    After the crawl, each stored row is converted to LaTeX.  For every
    lowercase ASCII letter, a ``\\section{<letter>}`` heading is inserted in
    front of the first entry whose headword starts with that letter.
    """
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

    # Collect the pieces in a list: repeated string += is quadratic over a
    # whole dictionary.
    pieces = []
    for entry, _url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        pieces.append(record.result + "\n")
    document = "".join(pieces)

    for char in ascii_lowercase:
        marker = "\\begin{entry}{" + char
        document = document.replace(marker, "\\section{" + char + "}\n\n" + marker, 1)

    with open("./final.tex", "w", encoding="utf-8") as f:
        f.write(document)


if __name__ == "__main__":
    main()

复制代码

		自动登录	找回密码
密码			免费注册

[求助] 有没有合适的PYTHON抓字典的案例可参考？