TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
'''
Based on xmllarge.py
'''
# from pyquery import PyQuery as pq
from pathlib import Path

def _first_index(haystack, needles, start):
    '''Return the lowest index >= start where any of needles occurs, or -1.'''
    hits = [i for i in (haystack.find(n, start) for n in needles) if i != -1]
    return min(hits) if hits else -1


def xml_iter(file, tag):
    '''
    Stream the <tag>...</tag> elements of a huge XML file, one at a time.

    The file is read line by line in binary mode, so only the element
    currently being assembled is held in memory.  Opening and closing
    tags may sit on their own lines, in the middle of a line, or share
    a line; several elements on one line are all returned.

    Limitations: this is a plain byte scan, not an XML parser.  Nested
    elements with the same name, self-closing tags (<tag/>) and tags
    inside comments/CDATA are not understood.  An opening tag is only
    recognised as '<tag>' or '<tag ' (followed by a space).

    :file: file path
    :tag: element to retrieve (str, without angle brackets)
    :returns: generator of bytes objects, each running from the opening
              tag through the matching closing tag inclusive
    '''
    name = tag.encode()
    open_tags = (b'<' + name + b'>', b'<' + name + b' ')
    close_tag = b'</' + name + b'>'

    with open(file, 'rb') as inputfile:
        parts = []      # chunks of the element being assembled
        inside = False  # True between an opening tag and its closing tag
        for line in inputfile:
            pos = 0     # first byte of this line belonging to the element
            scan = 0    # where to look for the next tag
            while True:
                start = _first_index(line, open_tags, scan)
                # A closing tag only matters while an element is open;
                # stray ones are ignored instead of yielding fragments.
                end = line.find(close_tag, scan) if inside else -1
                if start != -1 and (end == -1 or start < end):
                    # An opening tag (re)starts the buffer, discarding any
                    # unterminated element collected so far.
                    parts = []
                    inside = True
                    pos = start
                    scan = start + 1
                elif end != -1:
                    stop = end + len(close_tag)
                    parts.append(line[pos:stop])
                    yield b''.join(parts)
                    parts = []
                    inside = False
                    pos = scan = stop
                else:
                    break
            if inside:
                # Element continues on the next line: keep the rest of this one.
                parts.append(line[pos:])
复制代码

这么多人找这东西?我过一阵打包发个小工具。

上面的python3函数用法
resu = b''
for elm in xml_iter(filename, 'tu'):
    resu += elm

内存足迹极小……不管文件多大。 |
|