TA的每日心情 | 擦汗 2021-11-17 09:18 |
---|
签到天数: 79 天 [LV.6]常住居民II
|
发表于 2021-10-7 07:59:27
|
显示全部楼层
好的!! 麻烦了
import glob
import os
import re
import time
# Walk the directory tree and collect every file.
def getFileListAll(filePath):
    """Return the paths of all files below ``filePath``, sub-directories included.

    :param filePath: root directory handed to ``os.walk``
    :return: list of paths, each built as ``os.path.join(root, filename)``;
        empty when ``os.walk`` yields nothing (e.g. the directory is missing)
    """
    return [
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(filePath)
        for filename in filenames
    ]
5 r6 v$ e8 l5 {2 g
# Check that every name is a .txt document.
def isTxts(nameList):
    """Report whether every entry of ``nameList`` is an acceptable .txt file name.

    A name is accepted when it ends in ``.txt`` (case-insensitive), has at
    least one character before the extension and does not start with ``~``
    or ``$``.

    :param nameList: list of file names to check
    :return: True when every name is accepted; otherwise the rejected names
        are printed and False is returned
    """
    # ".*" rather than ".+": one-character names such as "a.txt" are valid.
    patternObj = re.compile(r"^[^~$].*\.(txt)$", re.I)
    # Collect the offending names themselves, not the whole input list.
    notTxtList = [
        fileName for fileName in nameList if not patternObj.fullmatch(fileName)
    ]
    if notTxtList:
        print("存在非txt文件:{0:},请处理后再运行程序!".format(notTxtList))
        return False
    return True
# Read the content of a .txt document.
def readTxt(fileName):
    """Return the UTF-8 text stored in ``fileName``, or ``""`` if it is unusable.

    The name check is applied to the whole string passed in: it must end in
    ``.txt`` (case-insensitive) and must not start with ``~`` or ``$``.

    :param fileName: path of the file to read
    :return: the file content; ``""`` when the name is rejected or when the
        file cannot be opened or decoded (a message is printed in that case)
    """
    patternObj = re.compile(r"^[^~$].*\.(txt)$", re.I)
    if not patternObj.fullmatch(fileName):
        # Return a string in every case so callers can concatenate the result.
        return ""
    try:
        with open(fileName, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, ValueError):
        # ValueError also covers UnicodeDecodeError for non-UTF-8 files.
        print("读取文档失败:{0:}".format(fileName))
        return ""
def writeTxt(txt, outPath):
    """Append ``txt`` to ``outPath`` as UTF-8, creating parent directories first.

    :param txt: text to write
    :param outPath: destination file path
    :return: True on success; False, after printing a message, when the
        directory cannot be created or the file cannot be written
    """
    try:
        # os.path.dirname understands the platform's separators; splitting on
        # a backslash produced an empty directory name for "/"-style paths.
        parent = os.path.dirname(outPath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Mode "a" is kept from the original: repeated runs append.
        with open(outPath, "a", encoding="utf-8") as f:
            f.write(txt)
    except (OSError, ValueError):
        print("写入文档失败:{0:}".format(outPath))
        return False
    return True
$ h2 T( b+ m$ o3 @
def getfileName(fileList):
    """Strip the directory part from every path in ``fileList``.

    Both ``/`` and ``\\`` are treated as separators, so Windows-style and
    POSIX-style paths give the same result.

    :param fileList: list of file paths
    :return nameList: list of file names, extension included, in input order
    """
    return [
        fileName.replace("\\", "/").rsplit("/", 1)[-1] for fileName in fileList
    ]
# Report duplicated file names.
def showDupFile(nameList):
    """Check ``nameList`` for names that occur more than once.

    :param nameList: list of file names
    :return: True when the list is non-empty and every name is unique;
        False when duplicates exist (each one is printed once) or when the
        list is empty, so callers skip processing in both cases
    """
    if not nameList:
        return False

    seen = set()
    duplicates = {}  # dict used as an ordered set: first-seen order, no repeats
    for name in nameList:
        if name in seen:
            duplicates[name] = None
        else:
            seen.add(name)

    if duplicates:
        for name in duplicates:
            print("{0:}为重复的文件,请处理!".format(name))
        return False

    print("没有发现文件名相同的文件,处理中...")
    return True
def nameListSort(nameList, rev=True):
    """Return a new list with the names ordered by string length.

    :param nameList: list of file names; it is not modified
    :param rev: True (default) puts the longest names first, False the shortest
    :return newNameList: the sorted copy; names of equal length keep their
        input order because ``sorted`` is stable
    """
    return sorted(nameList, key=len, reverse=rev)
5 L$ m0 _9 E; u J, v) {9 e - - c/ h5 H5 R: ?* E; P& r' y
# Add hyperlinks.
def formatTxtHref(nameList, txt):
    """Turn every occurrence of an entry name in ``txt`` into an ``entry://`` link.

    The keyword for each entry is its file name with ``".txt"`` removed.
    Four passes are made over the text:

    1. longest keyword first, every occurrence is wrapped as ``【@keyword@】``;
    2. shortest keyword first, a marker that sits inside another marker is
       unwrapped again (intended to leave only the enclosing keyword marked);
    3. each remaining ``【@keyword@】`` becomes
       ``<a href='entry://keyword'>keyword</a>``;
    4. every ``"<@>"`` helper separator is deleted.

    :param nameList: entry file names, extension included
    :param txt: the text to decorate
    :return txt: the text with hyperlinks inserted
    """
    print("\n开始循环添加超链接关键词辅助标记:")
    count = 0
    for nameD in sorted(nameList, key=len, reverse=True):
        keyword = nameD.replace(".txt", "")
        if keyword:  # replacing "" would insert markers between all characters
            txt = txt.replace(keyword, "【@" + keyword + "@】")
        count += 1
        print("\r已完成第{0: ^6}个".format(count), end="")

    print("\n开始循环清理嵌套的冗余的超链接关键词辅助标记:")
    count = 0
    for nameA in sorted(nameList, key=len):
        nameA = nameA.replace(".txt", "")
        if nameA:
            # Escape the keyword and build replacements with callables so that
            # regex metacharacters or backslashes in a name are taken literally.
            escaped = re.escape(nameA)
            # Marker nested at the tail of an enclosing marker's opening part.
            txt = re.sub(
                "【@([^@】]*?)【@" + escaped + "@】",
                lambda m, kw=nameA: "【@" + m.group(1) + "muyubug" + kw,
                txt,
            )
            # Marker nested at the head of an enclosing marker's closing part.
            txt = re.sub(
                "【@" + escaped + "@】([^【@]*?)@】",
                lambda m, kw=nameA: kw + m.group(1) + "muyubug" + "@】",
                txt,
            )
            # "muyubug" is a temporary sentinel inserted by the two
            # substitutions above; remove it before the next keyword.
            txt = txt.replace("muyubug", "")
        count += 1
        print("\r已完成第{0: ^6}个".format(count), end="")

    print("\n开始添加超链接,")
    txt = re.sub(r"【@([^【@]+?)@】", r"<a href='entry://\1'>\1</a>", txt)

    print("开始清理辅助标记,")
    return txt.replace("<@>", "")
) b& U* T0 m3 e- ?; ]
def formatText(fileName):
    """Build one dictionary entry (headword line plus body) from a text file.

    :param fileName: path of the file to process
    :return txt: ``headword + "\\n" + body + "<BR>\\n</>"`` where the headword
        is the file name without ``".txt"`` and with ``"<@>"`` between its
        characters, and the body is the file content with every newline and
        ``<br>`` replaced by ``<BR>``
    """
    # Take the last path component, accepting both "/" and "\\" separators.
    name = fileName.replace("\\", "/").rsplit("/", 1)[-1].replace(".txt", "")
    # "<@>" between characters is removed again by formatTxtHref; presumably
    # it keeps the headword from being matched as a link keyword there.
    nameMaked = "<@>".join(name)
    txtTitle = nameMaked + "\n"

    # Fall back to "" in case readTxt yields nothing for this file.
    txtContent = readTxt(fileName) or ""
    for ch in ("\n", "<br>"):
        txtContent = txtContent.replace(ch, "<BR>")

    # Headword, body and the "</>" end-of-entry line.
    return txtTitle + txtContent + "<BR>\n</>"
def mdxFormat(path, outputPath):
    """Merge all .txt files under ``path`` into one linked text at ``outputPath``.

    Processing stops without writing anything when ``isTxts`` or
    ``showDupFile`` rejects the collected file names.

    :param path: directory holding the source .txt files (walked recursively)
    :param outputPath: path and file name of the merged output
    """
    fileList = getFileListAll(path)  # every file, sub-directories included
    nameList = getfileName(fileList)
    if not isTxts(nameList):
        return
    if not showDupFile(nameList):
        return

    print("找到了{0:}个txt文档,".format(len(nameList)))

    print("开始格式化、合并文档:")
    txtList = []  # one formatted entry per file
    for count, fileName in enumerate(fileList, start=1):
        txtList.append(formatText(fileName))
        print("\r已完成第{0: ^6}个".format(count), end="")
    txt = "\n".join(txtList)

    # Add hyperlinks between entries.
    txt = formatTxtHref(nameList, txt)

    print("开始写入文本,")
    if writeTxt(txt, outputPath):
        print("文件合并输出成功!")
    else:
        print("Error:Merge!")
-
def main(
    path=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/test",
    outputPath=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/output/demo.txt",
):
    """Run ``mdxFormat`` and print the elapsed time in whole seconds.

    :param path: directory with the source .txt files; defaults to the
        location hard-coded in the original script
    :param outputPath: merged output file; defaults to the location
        hard-coded in the original script
    """
    timeStart = time.time()

    mdxFormat(path, outputPath)

    timeEnd = time.time()
    print("程序运行了%d秒" % (timeEnd - timeStart))
# Run the merge only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
复制代码 |
|