TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
- '''
5 d$ h4 h% |/ I p( o$ C - Based on xmllarge.py
2 c$ k2 x9 N6 t( {. F - '''. E1 C; i9 D' k) G s
- # from pyquery import PyQuery as pq
( t! G8 I% [- Q5 ^& W - from pathlib import Path4 N& x! x: G7 o: l% A
- 9 l' X/ _) i. w4 a- n# i2 a& l' ~: O
- 4 N: m. p2 K! C1 m" {
def xml_iter(file, tag):
    '''
    Stream the <tag>...</tag> elements of a huge XML file one at a time.

    The file is read line by line in binary mode and only the element
    currently being assembled is held in memory.  This is a plain byte
    scan, not an XML parser: an opening tag is recognised as ``<tag>`` or
    ``<tag `` (tag name followed by a space), a closing tag as ``</tag>``.
    Self-closing elements and nested elements of the same name are not
    supported.

    Opening and closing tags may appear anywhere in a line, including
    several complete elements on the same line.

    :file: file path
    :tag: element to retrieve
    :yields: bytes, from the opening tag up to and including the closing
             tag of each element
    '''
    open_plain = ('<' + tag + '>').encode()
    open_attrs = ('<' + tag + ' ').encode()
    close_tag = ('</' + tag + '>').encode()

    def find_open(data, start):
        # Earliest opening tag (either spelling) at or after ``start``;
        # -1 when there is none.
        hits = [i for i in (data.find(open_plain, start),
                            data.find(open_attrs, start)) if i >= 0]
        return min(hits) if hits else -1

    with open(file, 'rb') as inputfile:
        # ``chunks`` is None while outside an element, otherwise the list
        # of byte pieces collected so far for the current element.
        chunks = None
        for line in inputfile:
            pos = 0  # first byte of ``line`` not yet consumed
            while True:
                if chunks is None:
                    begin = find_open(line, pos)
                    if begin < 0:
                        # Nothing starts here; stray closing tags are ignored.
                        break
                    chunks = []
                    pos = begin
                    # Search past the '<' so the tag does not find itself.
                    search = begin + 1
                else:
                    search = pos

                end = line.find(close_tag, search)
                reopen = find_open(line, search)
                if reopen >= 0 and (end < 0 or reopen < end):
                    # A new opening tag shows up before the current element
                    # was closed: drop the partial element and start over
                    # from the new tag.
                    chunks = None
                    pos = reopen
                    continue

                if end < 0:
                    # Element continues on the following line(s).
                    chunks.append(line[pos:])
                    break

                end += len(close_tag)
                chunks.append(line[pos:end])
                yield b''.join(chunks)
                chunks = None
                pos = end  # keep scanning the rest of the line
复制代码
5 q6 i* S8 n! f
这么多人找这东西?我过一阵打包发个小工具。
; E5 S) e6 e8 z4 ]
上面的python3函数用法
resu = b''
for elm in xml_iter(filename, 'tu'):
    resu += elm
内存足迹极小……不管文件多大。 |
|