# Listing metadata: 138 lines, 6.3 KiB, Python
# -*- coding: utf-8 -*-
# Dependencies: paddleocr together with paddlepaddle (or paddlepaddle-gpu).
# Install them with: pip install paddlepaddle paddleocr
import logging
import os
import time

from paddleocr import PaddleOCR  # OCR engine class instantiated further down

# Turn off every log record at INFO level and below; WARNING and ERROR
# records are still emitted. Pass logging.WARNING instead to silence
# warnings as well.
logging.disable(logging.INFO)

# --- Configuration ---
# Root directory to scan; each immediate subfolder is processed separately.
BASE_IMAGE_DIR = '/Users/zpc01/workspace/zzlh/compliance/assets/images/dms1'

# Recognition language handed to PaddleOCR. 'ch' handles Chinese and English;
# alternatives include 'en', 'korean', 'japan', 'french', 'german', etc.
LANG = 'ch'

# Enable the angle classifier, which helps with rotated text.
USE_ANGLE_CLS = True

# GPU note: PaddleOCR tries to use the GPU on its own when paddlepaddle-gpu is
# installed and CUDA is available. To force CPU execution, define
# USE_GPU = False here and pass use_gpu=USE_GPU to the PaddleOCR constructor.

# --- Initialize PaddleOCR Engine ---
# The engine is built once and reused for every image, because construction
# downloads (on first use) and loads the models into memory.
print(f"正在初始化 PaddleOCR (语言: {LANG})... 这可能需要一些时间,特别是首次运行时。")
try:
    # Add use_gpu=... here to control GPU usage explicitly; when omitted,
    # PaddleOCR decides on its own.
    ocr_engine = PaddleOCR(use_angle_cls=USE_ANGLE_CLS, lang=LANG)
except Exception as e:
    # Top-level boundary: report whatever went wrong and stop, since nothing
    # below can run without an engine.
    print(f"初始化 PaddleOCR 时出错: {e}")
    print("请确保已正确安装 paddleocr 和 paddlepaddle (或 paddlepaddle-gpu)。")
    print("GPU 用户请确保 CUDA/cuDNN 环境配置正确。")
    # Exit with a non-zero status so callers can detect the failure. The
    # builtin exit() is a `site` helper meant for interactive use and would
    # report success (status 0).
    raise SystemExit(1)
else:
    print("PaddleOCR 初始化成功。")

# --- Iterate through base directory's subfolders ---
# Validate BASE_IMAGE_DIR and collect the names of its immediate subfolders.
print(f"开始处理目录: {BASE_IMAGE_DIR}")

if not os.path.isdir(BASE_IMAGE_DIR):
    print(f"错误:基础目录 '{BASE_IMAGE_DIR}' 不存在或不是一个有效的目录。")
    # Non-zero status: this is a configuration error, not a normal finish.
    raise SystemExit(1)

try:
    # os.listdir returns entries in arbitrary order; sort so that folders are
    # processed in the same order on every run.
    subfolders = sorted(
        entry for entry in os.listdir(BASE_IMAGE_DIR)
        if os.path.isdir(os.path.join(BASE_IMAGE_DIR, entry))
    )
except OSError as e:
    print(f"错误:无法访问目录 '{BASE_IMAGE_DIR}'。请检查权限。 {e}")
    raise SystemExit(1)

if not subfolders:
    print(f"在 '{BASE_IMAGE_DIR}' 下没有找到任何子文件夹。")
    # Nothing to do is not a failure, so keep the success status.
    raise SystemExit(0)

print(f"找到 {len(subfolders)} 个子文件夹,将逐一处理...")

# --- Process each subfolder ---
def _extract_texts(result):
    """Return the recognized text strings from one ``ocr_engine.ocr`` result.

    The code expects ``result[0]`` to hold the detected lines of the image,
    each shaped like ``[box_coords, (text, confidence)]``. Entries that do
    not match that shape are skipped. An empty or ``None`` result yields an
    empty list.
    """
    if not result or not result[0]:
        return []

    texts = []
    for line in result[0]:
        # Accept either a tuple or a list for the (text, confidence) pair so
        # that a different container type does not silently drop the text.
        if line and len(line) == 2 and isinstance(line[1], (list, tuple)) and len(line[1]) >= 1:
            texts.append(line[1][0])
    return texts


for folder_name in subfolders:
    subdir_path = os.path.join(BASE_IMAGE_DIR, folder_name)
    print(f"\n--- 正在处理文件夹: {folder_name} ---")

    # --- Find and sort PNG files in the current subdirectory ---
    try:
        # Regular files with a .png extension (case-insensitive), in
        # dictionary order.
        png_filenames_sorted = sorted(
            name for name in os.listdir(subdir_path)
            if name.lower().endswith('.png') and os.path.isfile(os.path.join(subdir_path, name))
        )
    except OSError as e:
        print(f" 错误:无法访问子文件夹 '{subdir_path}' 或读取其内容。跳过此文件夹。 {e}")
        continue  # Skip to the next folder

    if not png_filenames_sorted:
        print(f" 在文件夹 '{folder_name}' 中未找到 PNG 图片。")
        continue  # Skip to the next folder

    png_count = len(png_filenames_sorted)
    print(f" 找到 {png_count} 个 PNG 文件,将按字典序处理...")

    # --- Process PNG files in sorted order ---
    all_texts_in_folder = []  # Every text segment recognized in this folder
    for position, filename in enumerate(png_filenames_sorted, start=1):
        image_path = os.path.join(subdir_path, filename)
        print(f" 正在识别图片 (顺序: {position}/{png_count}): {filename} ...")

        try:
            # perf_counter is a monotonic clock, so the elapsed time cannot be
            # skewed by system clock adjustments.
            start_time = time.perf_counter()
            result = ocr_engine.ocr(image_path, cls=USE_ANGLE_CLS)
            elapsed = time.perf_counter() - start_time
            extracted_texts = _extract_texts(result)
        except Exception as e:
            # Best effort: a failure on one image must not abort the folder.
            print(f" 处理图片 '{filename}' 时发生错误: {e}")
            continue

        if extracted_texts:
            all_texts_in_folder.extend(extracted_texts)
            print(f" 识别到 {len(extracted_texts)} 段文本,耗时: {elapsed:.2f} 秒")
        else:
            print(" 未识别到文本。")

    # --- Write aggregated text to a file ---
    if not all_texts_in_folder:
        # PNG files were found (the empty case was skipped above) but none of
        # them produced any text.
        print(f" 文件夹 '{folder_name}' 中的 PNG 图片均未识别到文本,不创建 .txt 文件。")
        continue

    # The output is <folder_name>.txt inside the folder it was generated from.
    output_txt_path = os.path.join(subdir_path, f"{folder_name}.txt")
    try:
        with open(output_txt_path, 'w', encoding='utf-8') as out_file:
            # One recognized text segment per line.
            out_file.writelines(text + '\n' for text in all_texts_in_folder)
    except OSError as e:
        print(f" 错误:无法写入输出文件 '{output_txt_path}'. {e}")
    else:
        print(f" 已将所有识别结果写入文件: {output_txt_path}")

print("\n--- 所有文件夹处理完毕 ---")