讀取 txt, pdf, docx, xlsx 檔案的好用套件
以下是一個小範例, 掃描傳入的目錄, 並輸出處理結果:
原始碼:
#安裝套件 pip install pdfplumber openpyxl chardet docx2txt
#執行此程式 python check.py "檢查目錄路徑/log路徑"
import sys
import os
import codecs
from openpyxl import load_workbook
import pdfplumber
import docx2txt
import chardet
def process_text_files(dirs):
    """Scan a directory and write the per-file check results to a CSV log.

    Every regular file directly inside ``dirs[0]`` is handed to
    ``process_file`` (all file types, not only ``*.txt``; sub-directories
    are not descended into).  Whatever ``process_file`` returns is appended
    to ``checkLog.csv`` inside ``dirs[1]``, after a fixed header row.

    Args:
        dirs: Sequence whose first item is the directory to scan and whose
            second item is the directory that receives ``checkLog.csv``.
    """
    directory = dirs[0]
    logfile = os.path.join(dirs[1], "checkLog.csv")
    header = '"檔名","檢查規則","起","迄","不符合內容"\n'

    # Used to recognise the log itself when the scan directory and the log
    # directory are the same, so the half-written log is never scanned.
    log_key = os.path.normcase(os.path.abspath(logfile))

    # An explicit encoding keeps the non-ASCII header from depending on the
    # platform locale (which may be unable to encode it).  The BOM written by
    # "utf-8-sig" lets spreadsheet tools detect UTF-8.  The log is opened
    # once instead of being reopened for every scanned file.
    with open(logfile, "w", encoding="utf-8-sig") as log_file:
        log_file.write(header)
        print("產生log檔 " + logfile)
        # Using scandir as a context manager releases the directory handle
        # even if processing a file raises.
        with os.scandir(directory) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                entry_key = os.path.normcase(os.path.abspath(entry.path))
                if entry_key == log_key:
                    continue
                log_file.write(process_file(entry.path))
def process_file(file: str) -> str:
    """Extract the text of one file and return the CSV log rows for it.

    The extraction method is chosen from the file extension
    (case-insensitively):

    * ``.pdf`` -- text of every page via ``pdfplumber``.
    * ``.docx`` / ``.docm`` -- ``docx2txt.process``.
    * ``.xlsx`` / ``.xlsm`` -- every non-empty cell of every sheet via
      ``openpyxl``, each followed by a space.
    * anything else -- read as text, with the encoding guessed by
      ``chardet``.

    Args:
        file: Path of the file to read.

    Returns:
        The CSV text to append to the log.  Currently this is an empty
        string, except when the encoding of a text file cannot be
        determined (or the guessed encoding fails to decode the data), in
        which case a single row flagging the file is returned.
    """
    # Lower-case only for extension matching; the real path is used to open
    # the file so it is still found on case-sensitive filesystems.
    lowered = file.lower()
    content = ""
    if lowered.endswith(".pdf"):
        with pdfplumber.open(file) as pdf:
            # extract_text() can yield None for a page without text;
            # treat that as an empty page instead of failing.
            content = "".join(page.extract_text() or "" for page in pdf.pages)
    elif lowered.endswith((".docx", ".docm")):
        content = docx2txt.process(file)
    elif lowered.endswith((".xlsx", ".xlsm")):
        wb = load_workbook(file)
        pieces = []
        for sheet_name in wb.sheetnames:
            for row in wb[sheet_name].iter_rows():
                for cell in row:
                    # Skip empty cells so the literal text "None" does not
                    # end up in the content to be checked.
                    if cell.value is not None:
                        pieces.append(str(cell.value) + " ")
        content = "".join(pieces)
    else:
        with open(file, "rb") as f:
            raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']
        unknown_encoding_row = f'"{file}","認不得編碼",,,\n'
        if not encoding:
            return unknown_encoding_row
        try:
            # Decode the bytes already in memory rather than reading the
            # file a second time.
            content = raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # The guess was wrong, or names a codec Python does not have.
            return unknown_encoding_row
    log = ""
    # TODO: check `content` against the rules and append result rows to `log`.
    return log
if __name__ == "__main__":
    # Script entry point.  Exactly one argument is expected, of the form
    # "<directory to scan>/<directory for the log>".
    if len(sys.argv) != 2 or "/" not in sys.argv[1]:
        print('請輸入python check.py "檢查目錄路徑/log路徑"')
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist.
        sys.exit(1)
    dirs = sys.argv[1].split("/")
    # Only the first two parts are used; anything after a further "/" is
    # ignored.
    for i in range(2):
        # Drop stray double quotes and leading/trailing backslashes around
        # each part before validating it.
        path = dirs[i].strip('"').strip('\\')
        if not os.path.isdir(path):
            print("無效目錄!" + path)
            sys.exit(1)
        dirs[i] = path
    process_text_files(dirs)
    print(dirs[0] + "處理完成!")
呼叫語法:
python check.py "D:\File item\/D:\File item\log"
Taiwan is a country. 臺灣是我的國家