|
发表于 2024-1-14 02:18:24
|
显示全部楼层
查重并写进csv
- from docx import Document
- import os
- import os.path
- import difflib
- import csv
# Directory that is scanned (recursively) for Word documents.
# Built with os.path.join so it equals ".\\files" on Windows and still
# resolves on other platforms.
path = os.path.join(".", "files")

# Result file: one "name_a,name_b,ratio" row per compared pair of documents.
output_csv = "output.csv"

# Characters dropped before comparing: newline, space, no-break space and
# U+2716 (heavy multiplication sign). One translate() pass replaces the four
# chained str.replace() calls.
_NOISE = str.maketrans("", "", "\n \xa0\u2716")


def strip_noise(raw):
    """Return *raw* with newlines, spaces, NBSPs and U+2716 marks removed."""
    return raw.translate(_NOISE)


def extract_table_text(document):
    """Return the concatenated, noise-stripped text of all table cells.

    Only ``document.tables`` is read; text outside tables is not included.
    Cells are visited table by table, row by row, in ``row.cells`` order.
    """
    parts = []
    for table in document.tables:
        # Read the table row by row, collecting each cell's text.
        for row in table.rows:
            parts.extend(cell.text for cell in row.cells)
    return strip_noise("".join(parts))


def load_table_texts(root):
    """Map each .docx file under *root* to its cleaned table text.

    Keys are paths relative to *root*, so a file directly inside *root* is
    keyed by its bare file name while files in subdirectories stay distinct.
    Returns an empty dict when *root* contains no documents (or is missing).
    """
    texts = {}
    for parent, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Skip anything that is not a .docx, and the "~$..." owner/lock
            # files Word leaves next to open documents: neither can be
            # opened as a document.
            if not filename.lower().endswith(".docx") or filename.startswith("~$"):
                continue
            # Join with the directory os.walk is currently in, not the root,
            # so files inside subdirectories can actually be opened.
            full_path = os.path.join(parent, filename)
            print(full_path)  # progress: show which file is being read
            key = os.path.relpath(full_path, root)
            texts[key] = extract_table_text(Document(full_path))
    return texts


def similarity_rows(key1, texts):
    """Yield ``[key1, key2, ratio]`` for every key in *texts* greater than *key1*.

    The ``key1 < key2`` filter makes each unordered pair appear exactly once
    across all calls and skips comparing a document with itself. Keys are
    visited in the dict's iteration order. ``ratio`` is
    ``difflib.SequenceMatcher(None, texts[key1], texts[key2]).ratio()``.
    """
    for key2 in texts:
        if key1 < key2:
            matcher = difflib.SequenceMatcher(None, texts[key1], texts[key2])
            yield [key1, key2, matcher.ratio()]


def main():
    """Compare all documents under ``path`` pairwise and write ``output_csv``."""
    texts = load_table_texts(path)
    # The with-block guarantees the CSV is flushed and closed, even on error.
    with open(output_csv, mode="w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)  # created once, not once per row
        for key1 in texts:
            print(key1 + "...")  # progress: document currently being compared
            writer.writerows(similarity_rows(key1, texts))


if __name__ == "__main__":
    main()
复制代码 |
|