TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
- '''
- Based on xmllarge.py
- '''
- # from pyquery import PyQuery as pq
- from pathlib import Path
- 
def xml_iter(file, tag):
    """
    Iterate over the ``<tag>...</tag>`` elements of a huge XML file.

    The file is opened in binary mode and read line by line, and only the
    element currently being collected is buffered.

    Opening and closing tags may appear anywhere in a line: an element
    may start and end on the same line, several elements may share one
    line, and an element may span many lines.

    Limitations:
      * an opening tag is recognised only as ``<tag>`` or ``<tag `` (the
        tag name followed by a single space);
      * self-closing ``<tag/>`` elements and elements of the same name
        nested inside each other are not supported;
      * an element still unterminated at end of file is not yielded.

    :file: file path
    :tag: element to retrieve
    :returns: generator yielding each element as ``bytes``, from the
        start of its opening tag through the end of its closing tag
    """
    open_plain = ('<' + tag + '>').encode()
    open_attrs = ('<' + tag + ' ').encode()
    close_tag = ('</' + tag + '>').encode()

    with open(file, 'rb') as inputfile:
        # ``parts`` is None while outside an element; otherwise it holds
        # the byte chunks collected so far for the current element.
        parts = None
        for line in inputfile:
            pos = 0
            # A single line may contain any number of element boundaries,
            # so keep scanning it until nothing relevant is left.
            while True:
                if parts is None:
                    starts = [
                        idx
                        for idx in (line.find(open_plain, pos),
                                    line.find(open_attrs, pos))
                        if idx >= 0
                    ]
                    if not starts:
                        # No element starts in the rest of this line; any
                        # stray closing tag here is simply ignored.
                        break
                    pos = min(starts)
                    parts = []
                end = line.find(close_tag, pos)
                if end < 0:
                    # Element continues on the next line.
                    parts.append(line[pos:])
                    break
                end += len(close_tag)
                parts.append(line[pos:end])
                yield b''.join(parts)
                parts = None
                pos = end
复制代码
这么多人找这东西?我过一阵打包发个小工具。
上面的python3函数用法
# xml_iter yields bytes, so the accumulator must be bytes as well
# (adding bytes to a str raises TypeError).
resu = b''
for elm in xml_iter(filename, 'tu'):
    resu += elm
内存足迹极小……不管文件多大。 |
|