diff --git a/pom.xml b/pom.xml index ae56d33..b2ec804 100644 --- a/pom.xml +++ b/pom.xml @@ -242,6 +242,68 @@ + + + package-python-static + process-resources + + copy-resources + + + ${project.build.directory}/classes/python-runtime + + + packaging/python/python-3.9-linux-static + + **/*.pyc + **/__pycache__/** + **/test/** + **/tests/** + + + + + + + + package-python-venv + process-resources + + copy-resources + + + ${project.build.directory}/classes/python-runtime/venv-offline + + + packaging/python/venv-linux-offline + + **/*.pyc + **/__pycache__/** + **/tests/** + **/test/** + **/*.md + **/*.dist-info/** + + + + + + + + package-ocr-models + process-resources + + copy-resources + + + ${project.build.directory}/classes/models + + + packaging/python/models + + + + diff --git a/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/service/OcrService.java b/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/service/OcrService.java index 130c84a..71c2ab8 100644 --- a/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/service/OcrService.java +++ b/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/service/OcrService.java @@ -1,116 +1,78 @@ package com.chinaweal.youfool.reportdetect.modules.ocr.service; -import ai.djl.inference.Predictor; -import ai.djl.modality.cv.Image; -import ai.djl.modality.cv.ImageFactory; -import ai.djl.modality.cv.output.DetectedObjects; -import ai.djl.modality.cv.output.Rectangle; -import ai.djl.repository.zoo.Criteria; -import ai.djl.repository.zoo.ZooModel; -import ai.djl.translate.TranslateException; import com.chinaweal.youfool.reportdetect.common.utils.CertUtils; -import com.chinaweal.youfool.reportdetect.common.utils.PdfUtils; import com.chinaweal.youfool.reportdetect.modules.task.entity.OCRResult; -import com.chinaweal.youfool.reportdetect.modules.ocr.utils.CmaTemplateExtractor; +import com.chinaweal.youfool.reportdetect.modules.ocr.client.FlaskOCRClient; +import 
com.chinaweal.youfool.reportdetect.modules.ocr.dto.FlaskOCRResponse; import com.chinaweal.youfool.reportdetect.modules.ocr.utils.InstitutionNameCleaner; import com.chinaweal.youfool.reportdetect.modules.ocr.utils.InstitutionNameSearcher; -import com.chinaweal.youfool.reportdetect.modules.ocr.utils.SealExtractor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; -import javax.annotation.PostConstruct; -import java.io.File; -import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.*; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; -import java.awt.image.BufferedImage; -import javax.imageio.ImageIO; +/** + * OCR Service - Python-First Architecture + * + * This service delegates all OCR processing to the Python Flask API. + * The Java OCR implementation has been removed in favor of the + * Python-based OCR engine which provides better accuracy and + * easier maintenance. 
+ * + * Architecture: + * - Java Backend (Spring Boot) → Flask API (Python) → PaddleOCR + * - All OCR processing is done by the Python Flask API server + * - Java only handles business logic and database operations + * + * @author Claude Code + * @version 2.0 - Python-First Architecture + */ @Service public class OcrService { private static final Logger log = LoggerFactory.getLogger(OcrService.class); @Autowired - private LayoutDetectionService layoutService; + private FlaskOCRClient flaskOCRClient; - @Autowired - private PaddleOCRVLService paddleOCRVLService; - - @Autowired + @Autowired(required = false) private com.chinaweal.youfool.reportdetect.modules.ocr.engine.PythonOcrEngine pythonOcrEngine; - public void setLayoutService(LayoutDetectionService layoutService) { - this.layoutService = layoutService; - } + @Value("${app.ocr.engine:python}") + private String ocrEngineType; // python (recommended) or fallback - public void setPaddleOCRVLService(PaddleOCRVLService paddleOCRVLService) { - this.paddleOCRVLService = paddleOCRVLService; - } - - @Value("${app.ocr.mock:false}") - private boolean mockMode; - - @Value("${app.ocr.engine:java}") - private String ocrEngineType; // java or python - - private String vizPath; - - public void setVizPath(String vizPath) { - this.vizPath = vizPath; - } - - private static final Pattern CMA_PATTERN_1 = Pattern.compile("\\d{11}"); - private static final Pattern CMA_PATTERN_2 = Pattern.compile("\\d{12}"); - - private List recKeys = new ArrayList<>(); - private CmaTemplateExtractor cmaExtractor; - - private static final int MIN_POLYGONS_FOR_UNWARP = 3; - - @PostConstruct - public void init() { - try { - Path keyPath = Paths.get("src/main/resources/ppocr_keys_v1.txt"); - if (Files.exists(keyPath)) { - this.recKeys = Files.readAllLines(keyPath, StandardCharsets.UTF_8); - log.info("Loaded {} keys for OCR Recognition", recKeys.size()); - } - } catch (Exception e) { - log.warn("Failed to load OCR keys: {}", e.getMessage()); - } - - // 
Initialize CMA template extractor - this.cmaExtractor = new CmaTemplateExtractor(); - log.info("CMA Template Extractor initialized"); - } - - public static class OcrExecutionResult { - public String text = ""; - public List> sealResults = new ArrayList<>(); - public BufferedImage pageImage; // For CMA template matching - } + // Primary CMA pattern: 11-12 digits starting with '2' (matches CMA standard) + private static final Pattern CMA_PATTERN_PRIMARY = Pattern.compile("2\\d{10,11}"); + // Fallback patterns for edge cases + private static final Pattern CMA_PATTERN_FALLBACK_11 = Pattern.compile("\\d{11}"); + private static final Pattern CMA_PATTERN_FALLBACK_12 = Pattern.compile("\\d{12}"); + /** + * Process PDF and extract OCR information using Python Flask API. + * + * This is the main entry point for OCR processing. The flow is: + * 1. Try to extract institution name from digital certificate (CRT channel) + * 2. Delegate OCR processing to Python Flask API + * 3. Parse and return the OCR results + * + * @param pdfPath Path to the PDF file + * @param outputDir Directory for output files + * @return OCRResult containing extracted CMA code and institution name + */ public OCRResult processPdf(String pdfPath, String outputDir) { OCRResult result = new OCRResult(); - // Check if Python engine is enabled - if ("python".equalsIgnoreCase(ocrEngineType)) { - log.info("Using Python OCR Engine for: {} (Output: {})", pdfPath, outputDir); - return pythonOcrEngine.processPdf(pdfPath, outputDir); - } - - log.info("Starting Multi-Channel OCR Process (Python-Aligned) for: {}", pdfPath); + log.info("Starting OCR Process (Python-First Architecture) for: {}", pdfPath); + // Step 1: Try CRT channel (digital certificate extraction) try { List certOrgs = CertUtils.extractDigitalCertificateInfo(pdfPath); if (!certOrgs.isEmpty()) { @@ -122,336 +84,83 @@ public class OcrService { log.error("CRT channel failed", e); } - // Lazy Extraction: If CRT succeeded, we can skip expensive Seal/Layout 
steps - // But we still need full page OCR to extract CMA code (unless proper CMA - // extraction is implemented separately) - boolean skipSeals = (result.getExtractedOrg() != null && !result.getExtractedOrg().isEmpty()); - if (skipSeals) { - log.info("CRT Channel successful. Skipping Seal Extraction & Unwarping (Lazy Mode)."); - } - - OcrExecutionResult execResult = runOcrAlignmentFlow(pdfPath, skipSeals); - - // Extract CMA code using template matching (not regex) - String cmaCode = null; - if (execResult.pageImage != null && cmaExtractor != null) { - cmaCode = cmaExtractor.extractCmaCode(execResult.pageImage, img -> { - // OCR recognizer function for the CMA region - try { - return runOcrOnBufferedImage(img); - } catch (Exception e) { - log.error("OCR on CMA region failed", e); - return ""; - } - }); - if (cmaCode != null) { - log.info("✓ CMA code extracted via template matching: {}", cmaCode); - } else { - log.warn("✗ CMA template not found - Attempting Full Page Fallback"); - cmaCode = parseCmaCode(execResult.text); - if (cmaCode != null) { - log.info("✓ CMA code extracted via Full Page Fallback: {}", cmaCode); - } - } - } - - // Final fallback if still null (for cases where template match totally failed) - if (cmaCode == null) { - cmaCode = parseCmaCode(execResult.text); - if (cmaCode != null) { - log.info("✓ CMA code extracted via Full Page Fallback (Template skipped): {}", cmaCode); - } - } - - result.setExtractedCma(cmaCode); - result.setRawResult(Collections.singletonMap("seal_results", execResult.sealResults)); - - if (result.getExtractedOrg() == null || result.getExtractedOrg().isEmpty()) { - for (Map seal : execResult.sealResults) { - if (Boolean.TRUE.equals(seal.get("success"))) { - String org = InstitutionNameCleaner.clean((String) seal.get("text")); - if (org != null && !org.isEmpty()) { - log.info("✓ Found Organization from Seal OCR Channel: {}", org); - result.setExtractedOrg(org); - break; - } - } - } - } - - if (result.getExtractedOrg() == null 
|| result.getExtractedOrg().isEmpty()) { - List foundInsts = InstitutionNameSearcher.search(execResult.text); - if (!foundInsts.isEmpty()) { - String org = InstitutionNameCleaner.clean(foundInsts.get(0)); - log.info("✓ Found Organization from Full OCR Search Channel: {}", org); - result.setExtractedOrg(org); - } - } - - if (result.getExtractedOrg() != null && !result.getExtractedOrg().isEmpty()) { - result.setApiStatus("PASS"); - } else { - log.error("✗ Failed to extract Institution Name after all channels."); + // Step 2: Check if Flask OCR client is available + if (flaskOCRClient == null) { + log.error("FlaskOCRClient is not available. Check Spring configuration."); result.setApiStatus("FAIL"); + return result; } - return result; - } - - public OcrExecutionResult runOcr(String pdfPath) { - return runOcrAlignmentFlow(pdfPath, false); - } - - public OcrExecutionResult runOcrAlignmentFlow(String pdfPath, boolean skipSeals) { - OcrExecutionResult result = new OcrExecutionResult(); - StringBuilder fullPageText = new StringBuilder(); - - try { - Path tempDir; - if (this.vizPath != null && !this.vizPath.isEmpty()) { - tempDir = Paths.get(this.vizPath); - } else { - tempDir = Paths.get("data", "temp_ocr_" + System.currentTimeMillis()); - } - Files.createDirectories(tempDir); - // Limit to 1 page extraction - List> pages = PdfUtils.pdfToImages(pdfPath, tempDir.toString(), "temp", 1); - - Criteria detCriteria = Criteria.builder() - .setTypes(Image.class, DetectedObjects.class) - .optModelPath(Paths.get("models/pp-ocrv5/PP-OCRv5_server_det_onnx/inference.onnx")) - .optEngine("OnnxRuntime") - .optTranslator(new CustomDetectionTranslator()) - .build(); - - Criteria recCriteria = Criteria.builder() - .setTypes(Image.class, String.class) - .optModelPath(Paths.get("models/pp-ocrv5/PP-OCRv5_server_rec_onnx/inference.onnx")) - .optEngine("OnnxRuntime") - .optTranslator(new CustomRecognitionTranslator(this.recKeys)) - .build(); - - try (ZooModel detModel = detCriteria.loadModel(); 
- Predictor detector = detModel.newPredictor(); - ZooModel recModel = recCriteria.loadModel(); - Predictor recognizer = recModel.newPredictor()) { - - for (int pageIdx = 0; pageIdx < pages.size(); pageIdx++) { - String imgPath = (String) pages.get(pageIdx).get("image_path"); - Image img = ImageFactory.getInstance().fromFile(Paths.get(imgPath)); - - // Store page image for CMA template matching - if (pageIdx == 0) { - result.pageImage = ImageIO.read(Paths.get(imgPath).toFile()); - } - - // Skip Layout/Seal processing if requested (Lazy Extraction) - if (!skipSeals) { - List layoutItems = layoutService.getAllDetections(img); - List sealRegions = layoutItems.stream() - .filter(obj -> "seal".equals(obj.getClassName()) || "image".equals(obj.getClassName())) - .collect(Collectors.toList()); - - for (DetectedObjects.DetectedObject sealRegion : sealRegions) { - Rectangle box = sealRegion.getBoundingBox().getBounds(); - int sx = (int) (box.getX() * img.getWidth()); - int sy = (int) (box.getY() * img.getHeight()); - int sw = (int) (box.getWidth() * img.getWidth()); - int sh = (int) (box.getHeight() * img.getHeight()); - - sx = Math.max(0, sx); - sy = Math.max(0, sy); - sw = Math.min(sw, img.getWidth() - sx); - sh = Math.min(sh, img.getHeight() - sy); - if (sw < 10 || sh < 10) - continue; - - Image sealCrop = img.getSubImage(sx, sy, sw, sh); - DetectedObjects textDetections = detector.predict(sealCrop); - List points = parsePoints(textDetections); - - java.awt.image.BufferedImage awtSeal = toBufferedImage(sealCrop); - SealExtractor.SealCandidate sealInfo = SealExtractor.detectRedSeal(awtSeal); - - java.awt.Point center = (sealInfo != null) ? sealInfo.center - : new java.awt.Point(awtSeal.getWidth() / 2, awtSeal.getHeight() / 2); - int radius = (sealInfo != null) ? 
sealInfo.radius - : Math.min(awtSeal.getWidth(), awtSeal.getHeight()) / 2; - - java.awt.image.BufferedImage unwarped = null; - if (points.size() >= MIN_POLYGONS_FOR_UNWARP) { - unwarped = SealExtractor.polarUnwarpSmart(awtSeal, center, radius, points); - } else { - unwarped = SealExtractor.polarUnwarp(awtSeal, center, radius, 7.5); - } - - String extractedText = ""; - float confidence = 0.0f; - boolean success = false; - - if (unwarped != null) { - String recRaw = recognizer.predict(fromBufferedImage(unwarped)); - if (recRaw != null && recRaw.contains("|||")) { - String[] parts = recRaw.split("\\|\\|\\|"); - extractedText = parts[0].trim(); - confidence = Float.parseFloat(parts[1]); - if (confidence > 0.8) - success = true; - } - } - - // Backup flow - if (!success && paddleOCRVLService.isAvailable()) { - Path backupPath = tempDir.resolve("backup_" + System.currentTimeMillis() + ".png"); - sealCrop.save(Files.newOutputStream(backupPath), "png"); - PaddleOCRVLService.PaddleOCRVLResult vlRes = paddleOCRVLService - .recognizeSealText(backupPath.toFile()); - if (vlRes.isSuccess()) { - extractedText = vlRes.getText(); - confidence = (float) vlRes.getConfidence(); - success = true; - } - } - - if (success) { - Map sealDetail = new HashMap<>(); - sealDetail.put("text", extractedText); - sealDetail.put("confidence", confidence); - sealDetail.put("success", true); - result.sealResults.add(sealDetail); - fullPageText.append("SEAL_TEXT: ").append(extractedText).append("\n"); - } - } - } - - // Always run Full Page OCR for CMA code Extraction & Fallback Search - DetectedObjects pageText = detector.predict(img); - for (ai.djl.modality.Classifications.Classification c : pageText.items()) { - if (c instanceof DetectedObjects.DetectedObject) { - Rectangle b = ((DetectedObjects.DetectedObject) c).getBoundingBox().getBounds(); - Image block = img.getSubImage((int) (b.getX() * img.getWidth()), - (int) (b.getY() * img.getHeight()), - (int) (b.getWidth() * img.getWidth()), (int) 
(b.getHeight() * img.getHeight())); - String t = recognizer.predict(block); - if (t != null && t.contains("|||")) { - fullPageText.append(t.split("\\|\\|\\|")[0]).append(" "); - } - } - } - fullPageText.append("\n"); - } - } - - result.text = fullPageText.toString(); - - } catch (Exception e) { - log.error("OCR Alignment Flow failed", e); + // Step 3: Check Flask server health + if (!flaskOCRClient.isHealthy()) { + log.error("Flask OCR server is not available. Please start the Flask API server."); + result.setApiStatus("FAIL"); + return result; } - return result; - } + // Step 4: Delegate OCR processing to Python Flask API + log.info("Delegating OCR processing to Python Flask API"); + FlaskOCRResponse flaskResponse = flaskOCRClient.processPdf(pdfPath, outputDir); - private List parsePoints(DetectedObjects detections) { - List points = new ArrayList<>(); - for (ai.djl.modality.Classifications.Classification item : detections.items()) { - if (item instanceof DetectedObjects.DetectedObject) { - String cls = ((DetectedObjects.DetectedObject) item).getClassName(); - if (cls != null && cls.startsWith("text_points:")) { - String data = cls.substring("text_points:".length()); - for (String pStr : data.split(";")) { - if (pStr.contains(",")) { - String[] coords = pStr.split(","); - points.add(new int[] { Integer.parseInt(coords[0]), Integer.parseInt(coords[1]) }); - } - } + // Step 5: Parse Flask response + if (flaskResponse.isSuccess()) { + result.setExtractedCma(flaskResponse.getCmaCode()); + result.setExtractedOrg(flaskResponse.getInstitutionName()); + result.setApiStatus("PASS"); + + log.info("✓ OCR processing completed successfully"); + log.info(" CMA Code: {}", flaskResponse.getCmaCode()); + log.info(" Institution: {}", flaskResponse.getInstitutionName()); + log.info(" Confidence: {}", flaskResponse.getConfidence()); + } else { + result.setApiStatus("FAIL"); + log.error("✗ Flask OCR processing failed: {}", flaskResponse.getError()); + + // Fallback: Try to use 
PythonOcrEngine if available + if (pythonOcrEngine != null && "fallback".equalsIgnoreCase(ocrEngineType)) { + log.warn("Attempting fallback to PythonOcrEngine..."); + try { + return pythonOcrEngine.processPdf(pdfPath, outputDir); + } catch (Exception e) { + log.error("PythonOcrEngine fallback also failed", e); } } } - return points; - } - private java.awt.image.BufferedImage toBufferedImage(Image img) throws Exception { - java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); - img.save(bos, "png"); - return javax.imageio.ImageIO.read(new java.io.ByteArrayInputStream(bos.toByteArray())); - } - - private Image fromBufferedImage(java.awt.image.BufferedImage awt) throws Exception { - java.io.ByteArrayOutputStream os = new java.io.ByteArrayOutputStream(); - javax.imageio.ImageIO.write(awt, "png", os); - return ImageFactory.getInstance().fromInputStream(new java.io.ByteArrayInputStream(os.toByteArray())); + return result; } /** - * Run OCR on a BufferedImage and return text. - * Used for CMA template matching OCR. + * Parse CMA code from text using regex patterns. + * + * This method is kept for validation purposes. The actual CMA extraction + * is done by the Python Flask API using template matching. 
+ * + * @param text Text to search for CMA code + * @return CMA code if found, null otherwise */ - private String runOcrOnBufferedImage(BufferedImage img) { - try { - Image djlImg = fromBufferedImage(img); - - Criteria detCriteria = Criteria.builder() - .setTypes(Image.class, DetectedObjects.class) - .optModelPath(Paths.get("models/pp-ocrv5/PP-OCRv5_server_det_onnx/inference.onnx")) - .optEngine("OnnxRuntime") - .optTranslator(new CustomDetectionTranslator()) - .build(); - - Criteria recCriteria = Criteria.builder() - .setTypes(Image.class, String.class) - .optModelPath(Paths.get("models/pp-ocrv5/PP-OCRv5_server_rec_onnx/inference.onnx")) - .optEngine("OnnxRuntime") - .optTranslator(new CustomRecognitionTranslator(this.recKeys)) - .build(); - - StringBuilder textBuilder = new StringBuilder(); - try (ZooModel detModel = detCriteria.loadModel(); - Predictor detector = detModel.newPredictor(); - ZooModel recModel = recCriteria.loadModel(); - Predictor recognizer = recModel.newPredictor()) { - - DetectedObjects detections = detector.predict(djlImg); - for (ai.djl.modality.Classifications.Classification c : detections.items()) { - if (c instanceof DetectedObjects.DetectedObject) { - Rectangle b = ((DetectedObjects.DetectedObject) c).getBoundingBox().getBounds(); - int cx = (int) (b.getX() * djlImg.getWidth()); - int cy = (int) (b.getY() * djlImg.getHeight()); - int cw = (int) (b.getWidth() * djlImg.getWidth()); - int ch = (int) (b.getHeight() * djlImg.getHeight()); - cx = Math.max(0, cx); - cy = Math.max(0, cy); - cw = Math.min(cw, djlImg.getWidth() - cx); - ch = Math.min(ch, djlImg.getHeight() - cy); - if (cw > 5 && ch > 5) { - Image crop = djlImg.getSubImage(cx, cy, cw, ch); - String recRaw = recognizer.predict(crop); - if (recRaw != null && recRaw.contains("|||")) { - String[] parts = recRaw.split("\\|\\|\\|"); - textBuilder.append(parts[0]).append(" "); - } - } - } - } - } - return textBuilder.toString().trim(); - } catch (Exception e) { - 
log.error("runOcrOnBufferedImage failed", e); - return ""; - } - } - public String parseCmaCode(String text) { if (text == null || text.isEmpty()) return null; String clean = text.replace(" ", "").replace("\n", ""); - List candidates = new ArrayList<>(); - Matcher m1 = CMA_PATTERN_1.matcher(clean); - while (m1.find()) - candidates.add(m1.group()); + + // Try primary pattern first (CMA standard: starts with '2') + Matcher mPrimary = CMA_PATTERN_PRIMARY.matcher(clean); + if (mPrimary.find()) { + return mPrimary.group(); + } + + // Fallback to any 11-12 digit pattern + List candidates = new java.util.ArrayList<>(); + Matcher m11 = CMA_PATTERN_FALLBACK_11.matcher(clean); + while (m11.find()) { + candidates.add(m11.group()); + } if (candidates.isEmpty()) { - Matcher m2 = CMA_PATTERN_2.matcher(clean); - while (m2.find()) - candidates.add(m2.group()); + Matcher m12 = CMA_PATTERN_FALLBACK_12.matcher(clean); + while (m12.find()) { + candidates.add(m12.group()); + } } return candidates.isEmpty() ? 
null : candidates.get(0); } diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 3ab4d3b..61a5b79 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -65,18 +65,26 @@ app: preview-dir: ./data/previews attachment-dir: ./data/attachments ocr: - mock: false - engine: java - # Python Bridge Configuration + # OCR Engine Configuration + # 'python' - Use Flask API (recommended, Python-First architecture) + # 'java' - Deprecated: the Java OCR implementation was removed; OcrService no longer honors this value + # 'fallback' - Use Flask API as primary; fall back to PythonOcrEngine when a Flask request reports failure + engine: python + + # Flask OCR API Configuration (Python-First Architecture) + flask: + enabled: true + baseUrl: http://127.0.0.1:8081 + timeout: 300000 # 5 minutes timeout for OCR processing + # Flask Process Manager Configuration (for auto-starting Flask) + host: 127.0.0.1 + port: 8081 + startup-timeout: 60 # seconds to wait for Flask to be ready + + # Python Bridge Configuration (Legacy - for direct process calls) python: command: python script: ocr_bridge_cross_platform.py - # Flask OCR API Configuration - flask: - enabled: false - host: 127.0.0.1 - port: 8081 - startup-timeout: 60 # Resource Directories resource-dir: ./ocr-resources models-dir: ./models