[Python] 讀取各種檔案

2024-09-06

38
0
Python

讀取 txt、pdf、docx、xlsx 檔案的好用套件

以下是一個小範例：掃描傳入的目錄，並輸出處理結果。
原始碼:

#安裝套件 pip install pdfplumber openpyxl chardet docx2txt
#執行此程式 python check.py "檢查目錄路徑/log路徑"
import sys
import os
import codecs
from openpyxl import load_workbook
import pdfplumber
import docx2txt
import chardet

def process_text_files(dirs):
  """
  Scan a directory and write each file's check result to a CSV log.

  Every regular file directly inside the scan directory is passed to
  process_file (sub-directories are not descended into), and the text it
  returns is written to "checkLog.csv" inside the log directory. The log is
  recreated on every run and starts with a header row.

  Args:
    dirs: Sequence whose first item is the directory to scan and whose
      second item is the directory in which the log file is created.
  """
  directory = dirs[0]
  logfile = os.path.join(dirs[1], "checkLog.csv")
  header = '"檔名","檢查規則","起","迄","不符合內容"\n'
  # Normalised form of the log path, used to recognise the log file when the
  # scan directory and the log directory are the same.
  log_key = os.path.normcase(os.path.abspath(logfile))
  # Open the log once with an explicit encoding instead of relying on the
  # locale default, which may be unable to encode the Chinese header.
  # "utf-8-sig" writes a BOM so spreadsheet tools can recognise UTF-8.
  with open(logfile, "w", encoding="utf-8-sig") as log_file:
    log_file.write(header)
    log_file.flush()
    print("產生log檔 " + logfile)
    with os.scandir(directory) as entries:
      for entry in entries:
        if not entry.is_file():
          continue
        # Do not feed the log file that is currently being written back
        # into the checker.
        if os.path.normcase(os.path.abspath(entry.path)) == log_key:
          continue
        log_file.write(process_file(entry.path))

def process_file(file:str) -> str:
  """
  Extract the text of one file and return the log rows for it.

  The reader is chosen from the file extension, compared case-insensitively:
  ".pdf" uses pdfplumber, ".docx"/".docm" use docx2txt, ".xlsx"/".xlsm" use
  openpyxl. Any other file is read as text whose encoding is guessed by
  chardet.

  Args:
    file: Path of the file to read.

  Returns:
    Text to append to the CSV log. This is currently an empty string, except
    for a text file whose encoding cannot be detected or does not decode,
    which yields one row carrying the rule "認不得編碼".
  """
  # Lower-case a copy for extension matching only. The file itself is opened
  # under its real name, so mixed-case paths work on case-sensitive
  # filesystems and the log shows the actual file name.
  lowered = file.lower()
  content = ""
  if lowered.endswith(".pdf"):
    with pdfplumber.open(file) as pdf:
      # `or ""` guards against extract_text() yielding None for a page
      # (presumably pages without a text layer; depends on the pdfplumber
      # version in use).
      content = "".join(page.extract_text() or "" for page in pdf.pages)
  elif lowered.endswith((".docx", ".docm")):
    content = docx2txt.process(file)
  elif lowered.endswith((".xlsx", ".xlsm")):
    wb = load_workbook(file)
    parts = []
    for sheet_name in wb.sheetnames:
      for row in wb[sheet_name].iter_rows():
        for cell in row:
          # str() is applied as-is, so a cell whose value is None is
          # rendered as the text "None".
          parts.append(str(cell.value) + "  ")
    content = "".join(parts)
  else:
    with open(file, "rb") as f:
      raw_data = f.read()
    encoding = chardet.detect(raw_data)['encoding']
    unreadable = f'"{file}","認不得編碼",,,\n'
    if not encoding:
      return unreadable
    try:
      # Decode the bytes already in memory instead of reopening the file.
      content = raw_data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
      # The guessed encoding was wrong or is unknown to Python; report the
      # file instead of aborting the whole scan.
      return unreadable
  log = ""
  # TODO: check `content` and append one CSV row per finding to `log`.
  return log

if __name__ == "__main__":
  # Exactly one argument is expected, holding the scan directory and the log
  # directory joined by "/", e.g. "D:\File item\/D:\File item\log".
  if len(sys.argv) != 2 or "/" not in sys.argv[1]:
    print('請輸入python check.py "檢查目錄路徑/log路徑"')
    # sys.exit rather than the site-provided exit(), which is intended for
    # interactive use and is not guaranteed to be defined.
    sys.exit(1)

  dirs = sys.argv[1].split("/")
  # Only the first two parts are used: scan directory, then log directory.
  for i in range(2):
    # Drop stray quote characters and leading/trailing backslashes before
    # validating the path.
    path = dirs[i].strip('"').strip('\\')
    if not os.path.isdir(path):
      print("無效目錄！" + path)
      sys.exit(1)
    dirs[i] = path

  process_text_files(dirs)
  print(dirs[0] + "處理完成！")

呼叫語法:

python check.py "D:\File item\/D:\File item\log"

Taiwan is a country. 臺灣是我的國家

Python

回首頁

愛比的新手筆記

從推翻自己所寫的開始

[Python] 讀取各種檔案

標籤雲

系列文章