diff --git a/pom.xml b/pom.xml
index 8b17884..85979f9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -171,6 +171,11 @@
jsoup
1.17.2
+
+ org.apache.commons
+ commons-compress
+ 1.26.1
+
@@ -247,9 +252,9 @@
-
+
- package-python-static
+ package-python-archives
process-resources
copy-resources
@@ -258,37 +263,11 @@
${project.build.directory}/classes/python-runtime
- packaging/python/python-3.9-linux-static
-
- **/*.pyc
- **/__pycache__/**
- **/test/**
- **/tests/**
-
-
-
-
-
-
-
- package-python-venv
- process-resources
-
- copy-resources
-
-
- ${project.build.directory}/classes/python-runtime/venv-offline
-
-
- packaging/python/venv-linux-offline
-
- **/*.pyc
- **/__pycache__/**
- **/tests/**
- **/test/**
- **/*.md
- **/*.dist-info/**
-
+ packaging/python
+
+ python-runtime.tar.gz
+ venv-offline.tar.gz
+
diff --git a/requirements-offline.txt b/requirements-offline.txt
index 588a116..f21ef50 100644
--- a/requirements-offline.txt
+++ b/requirements-offline.txt
@@ -1,11 +1,11 @@
# Offline Python Requirements - Fixed Versions for Reproducibility
# Generated: 2026-03-04
-# Target: Linux x86_64, Python 3.9
+# Target: Linux x86_64, Python 3.10
# Core OCR Dependencies
paddleocr==2.7.5
-paddlepaddle==2.5.2
-opencv-python==4.8.0.76
+paddlepaddle==2.6.2
+opencv-python==4.6.0.66
pymupdf==1.23.0
pikepdf==8.0.0
diff --git a/src/main/java/com/chinaweal/youfool/reportdetect/common/utils/ResourceExtractor.java b/src/main/java/com/chinaweal/youfool/reportdetect/common/utils/ResourceExtractor.java
new file mode 100644
index 0000000..0b14894
--- /dev/null
+++ b/src/main/java/com/chinaweal/youfool/reportdetect/common/utils/ResourceExtractor.java
@@ -0,0 +1,471 @@
+package com.chinaweal.youfool.reportdetect.common.utils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.core.io.Resource;
+import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
+import org.springframework.core.io.support.ResourcePatternResolver;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.PostConstruct;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.*;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.jar.JarEntry;
+import java.util.jar.JarFile;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+
+/**
+ * Extracts Python resources and model files from JAR to working directory
+ */
+@Component
+public class ResourceExtractor {
+
+ private static final Logger log = LoggerFactory.getLogger(ResourceExtractor.class);
+
+ @Value("${app.ocr.resource-dir:./ocr-resources}")
+ private String resourceDir;
+
+ @Value("${app.ocr.models-dir:./models}")
+ private String modelsDir;
+
+ @Value("${app.ocr.extract-on-startup:true}")
+ private boolean extractOnStartup;
+
+ @PostConstruct
+ public void init() {
+ if (extractOnStartup) {
+ try {
+ extractIfMissing();
+ } catch (IOException e) {
+ log.error("Failed to extract resources on startup", e);
+ }
+ }
+ }
+
+ /**
+ * Runs the lightweight extraction: the Python scripts first, then the
+ * models-directory check. Each step returns early when its targets are
+ * already present.
+ *
+ * @throws IOException if either step fails
+ */
+ public void extractIfMissing() throws IOException {
+ extractPythonCode();
+ extractModels();
+ }
+
+    /**
+     * Unpacks everything needed for an offline deployment, in this order:
+     * Python runtime, virtual environment, OCR models, Python scripts.
+     * A banner is logged before and after, together with the elapsed time.
+     *
+     * @throws IOException if any of the four steps fails
+     */
+    public void extractAllResources() throws IOException {
+        final String banner = "========================================";
+        log.info(banner);
+        log.info("开始提取离线资源");
+        log.info(banner);
+
+        final long begin = System.currentTimeMillis();
+
+        extractPythonRuntime();       // step 1: interpreter
+        extractVirtualEnvironment();  // step 2: venv with pinned packages
+        extractOcrModels();           // step 3: OCR models
+        extractPythonCode();          // step 4: Python scripts
+
+        final long elapsedMillis = System.currentTimeMillis() - begin;
+        log.info(banner);
+        log.info("✓ 离线资源提取完成 (耗时: {}秒)", elapsedMillis / 1000.0);
+        log.info(banner);
+    }
+
+    /**
+     * Copies the bundled Python scripts into {@code resourceDir}.
+     * <p>
+     * Every script is checked individually and only the missing ones are
+     * extracted. A "directory is non-empty" test is not usable here: the runtime
+     * and venv extraction create {@code python-runtime/} inside the same
+     * directory, so such a test would always skip the scripts when this method
+     * runs as the last step of {@link #extractAllResources()}.
+     *
+     * @throws IOException if the target directory cannot be created or a copy fails
+     */
+    private void extractPythonCode() throws IOException {
+        Path targetDir = Paths.get(resourceDir);
+        Files.createDirectories(targetDir);
+
+        // classpath location -> file name inside resourceDir
+        Map<String, String> pythonResources = new HashMap<>();
+        pythonResources.put("python_api/ocr_api_server.py", "ocr_api_server.py");
+        pythonResources.put("python_api/ocr_task_consumer.py", "ocr_task_consumer.py");
+        pythonResources.put("src/main/python/pdf_processor.py", "pdf_processor.py");
+        pythonResources.put("test_accuracy_batch_full.py", "test_accuracy_batch_full.py");
+
+        boolean announced = false;
+        for (Map.Entry<String, String> entry : pythonResources.entrySet()) {
+            Path target = targetDir.resolve(entry.getValue());
+            if (Files.exists(target)) {
+                continue; // never overwrite a script that is already deployed
+            }
+            if (!announced) {
+                log.info("Extracting Python resources to {}", targetDir);
+                announced = true;
+            }
+            extractResource(entry.getKey(), target);
+        }
+
+        if (announced) {
+            log.info("Python resources extracted successfully");
+        } else {
+            log.info("Python resources already exist at {}", targetDir);
+        }
+    }
+
+ /**
+ * Extract model files from JAR to working directory
+ * NOTE: This is a placeholder - actual model extraction depends on how models are packaged
+ * For large models (3-5GB), consider:
+ * 1. External model download on first run
+ * 2. Separate model package
+ * 3. Docker volume mount
+ */
+ private void extractModels() throws IOException {
+ Path targetDir = Paths.get(modelsDir);
+
+ if (Files.exists(targetDir)) {
+ // Check if essential models exist
+ Path[] essentialModels = {
+ targetDir.resolve("pp-ocrv5/PP-OCRv5_server_det_onnx/inference.onnx"),
+ targetDir.resolve("pp-ocrv5/PP-OCRv5_server_rec_onnx/inference.onnx")
+ };
+
+ boolean allExist = true;
+ for (Path model : essentialModels) {
+ if (!Files.exists(model)) {
+ allExist = false;
+ break;
+ }
+ }
+
+ if (allExist) {
+ log.info("Models directory already exists with essential models at {}", targetDir);
+ return;
+ }
+ }
+
+ log.warn("Models directory missing or incomplete at {}", targetDir);
+ log.info("Models will be auto-downloaded by PaddleOCR on first use");
+
+ // Create directory structure
+ Files.createDirectories(targetDir);
+
+ // NOTE: For production, implement one of these strategies:
+ //
+ // Strategy 1: Download from CDN
+ // downloadModelsFromCdn(targetDir);
+ //
+ // Strategy 2: Extract from JAR (if bundled)
+ // extractModelsFromJar(targetDir);
+ //
+ // Strategy 3: Docker volume mount (recommended)
+ // Models are mounted as volume at runtime
+ }
+
+    /**
+     * Copies a single classpath resource to {@code targetPath}, replacing any
+     * existing file. If the class loader cannot find the resource and the
+     * application runs from a JAR, falls back to scanning the JAR entries.
+     *
+     * @param resourcePath classpath-relative location of the resource
+     * @param targetPath   destination file; missing parent directories are created
+     * @throws IOException if the copy fails
+     */
+    private void extractResource(String resourcePath, Path targetPath) throws IOException {
+        // try-with-resources accepts a null resource and closes the stream even when the copy throws
+        try (InputStream is = getClass().getClassLoader().getResourceAsStream(resourcePath)) {
+            if (is != null) {
+                Path parent = targetPath.getParent();
+                if (parent != null) {
+                    Files.createDirectories(parent);
+                }
+                Files.copy(is, targetPath, StandardCopyOption.REPLACE_EXISTING);
+                log.debug("Extracted: {} -> {}", resourcePath, targetPath);
+                return;
+            }
+        }
+
+        log.warn("Resource not found in classpath: {}", resourcePath);
+
+        // Try extracting from JAR directly
+        if (isRunningInJar()) {
+            extractFromJar(resourcePath, targetPath);
+        }
+    }
+
+    /**
+     * Extracts a file by scanning the entries of the JAR this class was loaded
+     * from. An entry matches when its name equals {@code resourcePath} or ends
+     * with {@code "/" + resourcePath}; the first match wins.
+     * <p>
+     * Handles both a plain {@code file:/.../app.jar} code-source location and a
+     * nested {@code jar:file:/.../app.jar!/...} location by cutting the URL at
+     * the first {@code "!/"}. Does nothing when no {@code .jar} file can be derived.
+     *
+     * @param resourcePath entry name (or suffix) to look for
+     * @param targetPath   destination file
+     * @throws IOException if the JAR cannot be read or the copy fails
+     */
+    private void extractFromJar(String resourcePath, Path targetPath) throws IOException {
+        java.security.CodeSource codeSource = getClass().getProtectionDomain().getCodeSource();
+        if (codeSource == null || codeSource.getLocation() == null) {
+            return;
+        }
+
+        // Reduce "jar:file:/x/app.jar!/BOOT-INF/classes!/" to "file:/x/app.jar"
+        String spec = codeSource.getLocation().toString();
+        if (spec.startsWith("jar:")) {
+            spec = spec.substring("jar:".length());
+        }
+        int bang = spec.indexOf("!/");
+        if (bang >= 0) {
+            spec = spec.substring(0, bang);
+        }
+        if (!spec.endsWith(".jar")) {
+            return;
+        }
+
+        java.io.File jarOnDisk;
+        try {
+            // Going through URI decodes percent-escapes such as %20
+            jarOnDisk = Paths.get(java.net.URI.create(spec)).toFile();
+        } catch (RuntimeException e) {
+            // Not a well-formed file: URI; fall back to the raw path text
+            jarOnDisk = new java.io.File(spec.startsWith("file:") ? spec.substring("file:".length()) : spec);
+        }
+
+        try (JarFile jarFile = new JarFile(jarOnDisk)) {
+            Enumeration<JarEntry> entries = jarFile.entries();
+
+            while (entries.hasMoreElements()) {
+                JarEntry entry = entries.nextElement();
+                String entryName = entry.getName();
+
+                if (entryName.equals(resourcePath) || entryName.endsWith("/" + resourcePath)) {
+                    try (InputStream is = jarFile.getInputStream(entry)) {
+                        Path parent = targetPath.getParent();
+                        if (parent != null) {
+                            Files.createDirectories(parent);
+                        }
+                        Files.copy(is, targetPath, StandardCopyOption.REPLACE_EXISTING);
+                        log.debug("Extracted from JAR: {} -> {}", entryName, targetPath);
+                        return;
+                    }
+                }
+            }
+        }
+
+        log.warn("Resource not found in JAR: {}", resourcePath);
+    }
+
+    /**
+     * Tells whether this class was loaded from a JAR.
+     * <p>
+     * The code-source URL uses the {@code jar} protocol for nested/fat JARs, but
+     * a plain JAR on the classpath is reported as {@code file:/.../app.jar}, so
+     * both forms are accepted.
+     *
+     * @return {@code true} if the code source is a JAR; {@code false} otherwise
+     *         or when the code source is unknown
+     */
+    private boolean isRunningInJar() {
+        java.security.CodeSource codeSource = getClass().getProtectionDomain().getCodeSource();
+        if (codeSource == null || codeSource.getLocation() == null) {
+            return false;
+        }
+        java.net.URL location = codeSource.getLocation();
+        return "jar".equals(location.getProtocol()) || location.getPath().endsWith(".jar");
+    }
+
+ /**
+ * Recursively delete a directory
+ */
+ public void deleteDirectory(Path directory) throws IOException {
+ if (Files.exists(directory)) {
+ Files.walkFileTree(directory, new SimpleFileVisitor() {
+ @Override
+ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+ Files.delete(file);
+ return FileVisitResult.CONTINUE;
+ }
+
+ @Override
+ public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+ Files.delete(dir);
+ return FileVisitResult.CONTINUE;
+ }
+ });
+ }
+ }
+
+ /** Returns the value injected from {@code app.ocr.resource-dir} (default {@code ./ocr-resources}). */
+ public String getResourceDir() {
+ return resourceDir;
+ }
+
+ /** Returns the value injected from {@code app.ocr.models-dir} (default {@code ./models}). */
+ public String getModelsDir() {
+ return modelsDir;
+ }
+
+ /** Returns the value injected from {@code app.ocr.extract-on-startup} (default {@code true}). */
+ public boolean isExtractOnStartup() {
+ return extractOnStartup;
+ }
+
+ // ========== Offline Deployment Methods ==========
+
+ private void extractPythonRuntime() throws IOException {
+ Path targetDir = Paths.get(resourceDir, "python-runtime");
+
+ if (Files.exists(targetDir.resolve("python/bin/python3.10")) ||
+ Files.exists(targetDir.resolve("python/bin/python3"))) {
+ log.info("[1/4] Python runtime already exists, skipping");
+ return;
+ }
+
+ log.info("[1/4] Extracting Python runtime...");
+ Files.createDirectories(targetDir);
+ if (!extractTarGzResource("python-runtime/python-runtime.tar.gz", targetDir)) {
+ copyDirectoryFromClasspath("python-runtime", targetDir);
+ }
+ makeExecutable(targetDir.resolve("python/bin/python3.10"));
+ makeExecutable(targetDir.resolve("python/bin/python3"));
+ log.info(" Done");
+ }
+
+ private void extractVirtualEnvironment() throws IOException {
+ Path targetDir = Paths.get(resourceDir, "python-runtime", "venv-offline");
+
+ if (Files.exists(targetDir.resolve("bin/python3.10")) ||
+ Files.exists(targetDir.resolve("bin/python3"))) {
+ log.info("[2/4] Python venv already exists, skipping");
+ return;
+ }
+
+ log.info("[2/4] Extracting Python venv...");
+ Files.createDirectories(targetDir.getParent());
+ if (!extractTarGzResource("python-runtime/venv-offline.tar.gz", targetDir.getParent())) {
+ Files.createDirectories(targetDir);
+ copyDirectoryFromClasspath("python-runtime/venv-offline", targetDir);
+ }
+ makeExecutable(targetDir.resolve("bin/python3.10"));
+ makeExecutable(targetDir.resolve("bin/python3"));
+ log.info(" Done");
+ }
+
+ private void extractOcrModels() throws IOException {
+ Path targetDir = Paths.get(modelsDir);
+
+ if (Files.exists(targetDir.resolve("pp-ocrv5/det_model/inference.onnx"))) {
+ log.info("[3/4] OCR模型已存在,跳过");
+ return;
+ }
+
+ log.info("[3/4] 提取OCR模型...");
+ Files.createDirectories(targetDir);
+ copyDirectoryFromClasspath("models", targetDir);
+ log.info(" ✓ 完成");
+ }
+
+    /**
+     * Copies every readable file below a classpath directory into
+     * {@code targetDir}, preserving the relative layout.
+     * <p>
+     * The relative part of each resource is made non-absolute before it is
+     * resolved: {@link Path#resolve(String)} returns its argument unchanged when
+     * that argument is absolute, so a leading {@code "/"} would send files to
+     * the filesystem root instead of {@code targetDir}.
+     *
+     * @param resourcePath classpath directory, without leading or trailing slash
+     * @param targetDir    destination directory
+     * @throws IOException if the resource listing or a resource URL cannot be resolved
+     */
+    private void copyDirectoryFromClasspath(String resourcePath, Path targetDir) throws IOException {
+        log.info(" 复制资源: {}", resourcePath);
+
+        ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
+        Resource[] resources = resolver.getResources("classpath*:" + resourcePath + "/**");
+        Path root = targetDir.toAbsolutePath().normalize();
+
+        int fileCount = 0;
+        for (Resource resource : resources) {
+            if (!resource.isReadable() || resource.getURL().toString().endsWith("/")) {
+                continue; // directories and unreadable entries
+            }
+
+            String urlPath = resource.getURL().getPath();
+            // Start searching after the last "!/" so the JAR's own location on disk
+            // (which may contain the same directory name) is never matched.
+            // For exploded classpaths an ambiguity with parent directory names remains.
+            int searchFrom = Math.max(urlPath.lastIndexOf("!/"), 0);
+            int start = urlPath.indexOf(resourcePath + "/", searchFrom);
+            if (start < 0) {
+                log.warn("跳过文件: {}", resource.getFilename());
+                continue;
+            }
+
+            String relative = urlPath.substring(start + resourcePath.length());
+            while (relative.startsWith("/")) {
+                relative = relative.substring(1);
+            }
+            if (relative.isEmpty()) {
+                continue;
+            }
+
+            Path targetPath = root.resolve(relative).normalize();
+            if (!targetPath.startsWith(root)) {
+                log.warn("跳过文件: {}", resource.getFilename());
+                continue;
+            }
+
+            try (InputStream in = resource.getInputStream()) {
+                Files.createDirectories(targetPath.getParent());
+                Files.copy(in, targetPath, StandardCopyOption.REPLACE_EXISTING);
+                fileCount++;
+            } catch (IOException e) {
+                // Best effort: keep copying the rest, but make the failure visible
+                log.warn("跳过文件: {}", resource.getFilename(), e);
+            }
+        }
+        log.info(" 复制了 {} 个文件", fileCount);
+    }
+
+    /**
+     * Extracts a {@code .tar.gz} classpath resource into {@code targetDir}.
+     * <p>
+     * The archive is first copied to a temp file (avoids nested-jar stream
+     * issues). On Linux the system {@code tar} is tried first; if it is
+     * unavailable or fails, or on other platforms, the archive is unpacked in
+     * Java. The temp file is removed on every exit path.
+     *
+     * @param resourceName classpath location of the archive
+     * @param targetDir    directory to unpack into; created if missing
+     * @return {@code true} if files were extracted; {@code false} if the
+     *         resource is missing, empty, or produced no files, so callers can
+     *         fall back to another source
+     * @throws IOException if reading the archive or writing a file fails
+     */
+    private boolean extractTarGzResource(String resourceName, Path targetDir) throws IOException {
+        ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
+        Resource resource = resolver.getResource("classpath:" + resourceName);
+        if (!resource.exists()) {
+            return false;
+        }
+
+        log.info(" Extracting archive: {}", resourceName);
+        // Absolute + normalized so the containment check below compares like with like
+        // (a target such as "./ocr-resources" would otherwise never be a prefix of a normalized path).
+        Path root = targetDir.toAbsolutePath().normalize();
+        Files.createDirectories(root);
+
+        Path tmp = Files.createTempFile("ocr-archive-", ".tar.gz");
+        try {
+            try (InputStream is = resource.getInputStream()) {
+                Files.copy(is, tmp, StandardCopyOption.REPLACE_EXISTING);
+            }
+
+            long tmpSize = Files.size(tmp);
+            if (tmpSize <= 0) {
+                log.warn(" Archive size is 0 bytes: {}", resourceName);
+                return false;
+            }
+            log.info(" Archive temp file: {} ({} bytes)", tmp.toAbsolutePath(), tmpSize);
+
+            // Prefer system tar on Linux for reliability (it also restores symlinks and modes)
+            if (isLinux() && extractWithSystemTar(tmp, root, resourceName)) {
+                return true;
+            }
+
+            int extracted = extractWithJavaTar(tmp, root);
+            if (extracted == 0) {
+                log.warn(" Archive extracted 0 files: {}", resourceName);
+                return false;
+            }
+            log.info(" Extracted {} files from {}", extracted, resourceName);
+            return true;
+        } finally {
+            try {
+                Files.deleteIfExists(tmp);
+            } catch (IOException ignored) {
+                // Best effort: a leftover temp file must not mask the real outcome
+            }
+        }
+    }
+
+    /** Runs {@code tar -xzf archive -C targetDir}; returns true only on exit code 0. */
+    private boolean extractWithSystemTar(Path archive, Path targetDir, String resourceName) {
+        Process p = null;
+        try {
+            ProcessBuilder pb = new ProcessBuilder(
+                    "tar", "-xzf", archive.toAbsolutePath().toString(), "-C", targetDir.toAbsolutePath().toString());
+            pb.redirectErrorStream(true);
+            p = pb.start();
+
+            // Drain output so the child can never block on a full pipe
+            StringBuilder out = new StringBuilder();
+            try (java.io.BufferedReader br = new java.io.BufferedReader(
+                    new java.io.InputStreamReader(p.getInputStream()))) {
+                String line;
+                while ((line = br.readLine()) != null) {
+                    out.append(line).append('\n');
+                }
+            }
+
+            int code = p.waitFor();
+            if (code == 0) {
+                log.info(" System tar extraction succeeded: {}", resourceName);
+                return true;
+            }
+            log.warn(" System tar extraction failed (code {}): {}", code, resourceName);
+            if (out.length() > 0) {
+                log.warn(" System tar output: {}", out.toString().trim());
+            }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            if (p != null) {
+                p.destroy();
+            }
+            log.warn(" System tar extraction error: {}", e.getMessage());
+        } catch (IOException e) {
+            log.warn(" System tar extraction error: {}", e.getMessage());
+        }
+        return false;
+    }
+
+    /** Unpacks regular files and directories with Commons Compress; returns the number of files written. */
+    private int extractWithJavaTar(Path archive, Path root) throws IOException {
+        int extracted = 0;
+        try (InputStream fis = Files.newInputStream(archive);
+             GZIPInputStream gis = new GZIPInputStream(fis);
+             TarArchiveInputStream tis = new TarArchiveInputStream(gis)) {
+
+            TarArchiveEntry entry;
+            while ((entry = tis.getNextTarEntry()) != null) {
+                String entryName = entry.getName();
+                if (entryName == null || entryName.isEmpty()) {
+                    continue;
+                }
+                Path outPath = root.resolve(entryName).normalize();
+                if (!outPath.startsWith(root)) {
+                    continue; // entry would escape the target directory
+                }
+                if (entry.isDirectory()) {
+                    Files.createDirectories(outPath);
+                    continue;
+                }
+                if (entry.isSymbolicLink() || entry.isLink()) {
+                    // Skip symlinks to keep Windows compatibility
+                    continue;
+                }
+                Files.createDirectories(outPath.getParent());
+                Files.copy(tis, outPath, StandardCopyOption.REPLACE_EXISTING);
+                extracted++;
+            }
+        }
+        return extracted;
+    }
+
+    /** Returns whether the {@code os.name} system property contains "linux" (case-insensitive). */
+    private boolean isLinux() {
+        return System.getProperty("os.name").toLowerCase().contains("linux");
+    }
+
+    /**
+     * Marks {@code file} as executable for all users, if it exists.
+     * <p>
+     * The permission is set synchronously through the JDK. A fire-and-forget
+     * {@code chmod} child process is avoided on purpose: nothing would wait for
+     * it, so the interpreter could be launched before the bit is set.
+     * Failures are logged and never thrown.
+     *
+     * @param file file to make executable; silently ignored when missing
+     */
+    private void makeExecutable(Path file) {
+        if (!Files.exists(file)) {
+            return;
+        }
+        try {
+            // ownerOnly=false grants execute permission to everybody
+            boolean changed = file.toFile().setExecutable(true, false);
+            if (!changed) {
+                log.warn("无法设置可执行权限: {}", file);
+            }
+        } catch (SecurityException | UnsupportedOperationException e) {
+            log.warn("无法设置可执行权限: {}", file, e);
+        }
+    }
+}
diff --git a/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/engine/FlaskProcessManager.java b/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/engine/FlaskProcessManager.java
new file mode 100644
index 0000000..7bd12e0
--- /dev/null
+++ b/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/engine/FlaskProcessManager.java
@@ -0,0 +1,424 @@
+package com.chinaweal.youfool.reportdetect.modules.ocr.engine;
+
+import com.chinaweal.youfool.reportdetect.common.utils.ResourceExtractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.ApplicationListener;
+import org.springframework.context.event.ContextRefreshedEvent;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.PreDestroy;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Manages Flask OCR API process lifecycle
+ * - Auto-starts Flask on application startup
+ * - Monitors Flask health
+ * - Gracefully shuts down on application exit
+ */
+@Component
+public class FlaskProcessManager implements ApplicationListener {
+
+ private static final Logger log = LoggerFactory.getLogger(FlaskProcessManager.class);
+
+ @Value("${app.ocr.flask.enabled:true}")
+ private boolean flaskEnabled;
+
+ @Value("${app.ocr.flask.host:127.0.0.1}")
+ private String flaskHost;
+
+ @Value("${app.ocr.flask.port:8081}")
+ private int flaskPort;
+
+ @Value("${app.ocr.flask.startup-timeout:60}")
+ private int startupTimeoutSeconds;
+
+ @Value("${app.ocr.flask.disable-model-source-check:true}")
+ private boolean disableModelSourceCheck;
+
+ @Value("${app.ocr.python-command:python}")
+ private String pythonCommand;
+
+ @Value("${app.ocr.resource-dir:./ocr-resources}")
+ private String resourceDir;
+
+ @Value("${app.ocr.models-dir:./models}")
+ private String modelsDir;
+
+ @Autowired
+ private ResourceExtractor resourceExtractor;
+
+ private Process flaskProcess;
+ private boolean flaskReady = false;
+
+    /**
+     * Starts the Flask process when the application context has been refreshed,
+     * unless {@code app.ocr.flask.enabled} is {@code false}.
+     */
+    @Override
+    public void onApplicationEvent(ContextRefreshedEvent event) {
+        if (!flaskEnabled) {
+            log.info("Flask process management is disabled");
+            return;
+        }
+        startFlaskProcess();
+    }
+
+ /**
+ * Start Flask API server with embedded Python (offline mode)
+ */
+ public synchronized void startFlaskProcess() {
+ if (!flaskEnabled) {
+ log.info("Flask process management is disabled");
+ return;
+ }
+ if (flaskProcess != null && flaskProcess.isAlive()) {
+ log.info("Flask OCR API server is already running");
+ return;
+ }
+ log.info("Starting Flask OCR API server...");
+
+ try {
+ // Extract all resources for offline mode
+ resourceExtractor.extractAllResources();
+
+ // Determine Python executable and Flask API script path
+ String pythonExecutable;
+ Path apiScriptPath;
+
+ // Require embedded Python (Linux + Windows layouts)
+ Path embeddedPython = resolveEmbeddedPython();
+ if (embeddedPython != null) {
+ pythonExecutable = embeddedPython.toString();
+ log.info("Using embedded Python: {}", pythonExecutable);
+ } else {
+ log.error("Embedded Python not found. Refusing to start Flask with system Python.");
+ log.error("Expected embedded runtime under: {}/python-runtime/python", resourceDir);
+ return;
+ }
+
+ // Try multiple locations for Flask API script
+ // Priority 1: Project's python_api directory (development)
+ apiScriptPath = Paths.get("./python_api/ocr_api_server.py");
+ if (Files.exists(apiScriptPath)) {
+ log.info("Found Flask API script at: {}", apiScriptPath);
+ } else {
+ // Priority 2: Embedded in ocr-resources (offline deployment)
+ apiScriptPath = Paths.get(resourceDir, "python-api/ocr_api_server.py");
+ if (Files.exists(apiScriptPath)) {
+ log.info("Found Flask API script at: {}", apiScriptPath);
+ } else {
+ // Priority 3: Root of ocr-resources
+ apiScriptPath = Paths.get(resourceDir, "ocr_api_server.py");
+ if (Files.exists(apiScriptPath)) {
+ log.info("Found Flask API script at: {}", apiScriptPath);
+ } else {
+ log.error("Flask API script not found at any of the following locations:");
+ log.error(" 1. ./python_api/ocr_api_server.py");
+ log.error(" 2. {}/python-api/ocr_api_server.py", resourceDir);
+ log.error(" 3. {}/ocr_api_server.py", resourceDir);
+ return;
+ }
+ }
+ }
+
+ // Build command
+ List command = new ArrayList<>();
+ command.add(pythonExecutable);
+ command.add(apiScriptPath.toAbsolutePath().toString());
+
+ // Configure ProcessBuilder
+ ProcessBuilder pb = new ProcessBuilder(command);
+ // Set working directory to project root (where python_api is located)
+ pb.directory(new File("."));
+
+ // Set environment variables for offline mode
+ Map env = pb.environment();
+ env.put("PYTHONIOENCODING", "utf-8");
+ env.put("PYTHONUNBUFFERED", "1");
+ // Note: Don't set PYTHONNOUSERSITE=1 as it blocks loading system-installed Flask
+
+ // Use bundled libraries if available (Linux/Windows)
+ Path libPath = resolveEmbeddedSitePackages();
+ if (libPath != null) {
+ env.put("PYTHONPATH", libPath.toString());
+ }
+ // Prefer embedded Python home when using standalone runtime
+ Path pythonHome = resolveEmbeddedPythonHome();
+ if (pythonHome != null) {
+ env.put("PYTHONHOME", pythonHome.toString());
+ }
+
+ // Use bundled models
+ Path modelsPath = Paths.get(modelsDir).toAbsolutePath();
+ env.put("PADDLEOCR_HOME", modelsPath.toString());
+ env.put("HUB_HOME", modelsPath.toString());
+ if (disableModelSourceCheck) {
+ env.put("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True");
+ }
+
+ // Flask configuration
+ env.put("PORT", String.valueOf(flaskPort));
+ env.put("HOST", flaskHost);
+
+ pb.redirectErrorStream(true);
+
+ log.info("Starting Flask with embedded Python: {}", pythonExecutable);
+ log.info("Flask will listen on: http://{}:{}", flaskHost, flaskPort);
+
+ // Start process
+ flaskProcess = pb.start();
+ startFlaskLogReader();
+ waitForFlaskReady();
+
+ if (flaskReady) {
+ log.info("✓ Flask OCR API server started successfully (offline mode)");
+ } else {
+ log.warn("Flask OCR API server not ready yet; keeping process running for late readiness");
+ }
+
+ } catch (Exception e) {
+ log.error("Failed to start Flask process (offline mode)", e);
+ }
+ }
+
+ /**
+ * Start thread to read Flask logs
+ */
+ private void startFlaskLogReader() {
+ Thread logReader = new Thread(() -> {
+ try (BufferedReader reader = new BufferedReader(
+ new InputStreamReader(flaskProcess.getInputStream()))) {
+
+ String line;
+ while ((line = reader.readLine()) != null) {
+ log.info("[Flask] {}", line);
+ }
+
+ } catch (IOException e) {
+ if (flaskProcess.isAlive()) {
+ log.warn("Error reading Flask logs", e);
+ }
+ }
+ });
+
+ logReader.setDaemon(true);
+ logReader.setName("Flask-Log-Reader");
+ logReader.start();
+ }
+
+    /**
+     * Polls {@code http://<host>:<port>/health} every two seconds until it
+     * answers 200, the process dies, the thread is interrupted, or
+     * {@code startupTimeoutSeconds} elapses. Sets {@code flaskReady} to
+     * {@code true} only on a 200 response.
+     */
+    private void waitForFlaskReady() {
+        log.info("Waiting for Flask to be ready (timeout: {}s)", startupTimeoutSeconds);
+
+        String healthUrl = String.format("http://%s:%d/health", flaskHost, flaskPort);
+        long startTime = System.currentTimeMillis();
+        long timeoutMillis = TimeUnit.SECONDS.toMillis(startupTimeoutSeconds);
+
+        while (System.currentTimeMillis() - startTime < timeoutMillis) {
+            if (!flaskProcess.isAlive()) {
+                log.error("Flask process terminated unexpectedly");
+                return;
+            }
+
+            HttpURLConnection conn = null;
+            try {
+                conn = (HttpURLConnection) new URL(healthUrl).openConnection();
+                conn.setRequestMethod("GET");
+                conn.setConnectTimeout(2000);
+                conn.setReadTimeout(2000);
+
+                int responseCode = conn.getResponseCode();
+                if (responseCode == 200) {
+                    log.info("Flask health check passed");
+                    flaskReady = true;
+                    return;
+                }
+
+            } catch (IOException e) {
+                // Not ready yet, continue waiting
+                log.debug("Flask not ready yet: {}", e.getMessage());
+            } finally {
+                // Release the connection after every probe instead of leaving it to GC
+                if (conn != null) {
+                    conn.disconnect();
+                }
+            }
+
+            try {
+                TimeUnit.SECONDS.sleep(2);
+            } catch (InterruptedException ie) {
+                Thread.currentThread().interrupt();
+                return;
+            }
+        }
+
+        log.warn("Flask health check timeout after {}s", startupTimeoutSeconds);
+    }
+
+ /**
+ * Stop Flask process gracefully
+ */
+ @PreDestroy
+ public void stopFlaskProcess() {
+ if (flaskProcess != null && flaskProcess.isAlive()) {
+ log.info("Stopping Flask OCR API server...");
+
+ // Try graceful shutdown first
+ flaskProcess.destroy();
+
+ try {
+ // Wait up to 10 seconds for process to terminate
+ if (!flaskProcess.waitFor(10, TimeUnit.SECONDS)) {
+ log.warn("Flask did not stop gracefully, forcing termination");
+ flaskProcess.destroyForcibly();
+ }
+
+ log.info("Flask OCR API server stopped");
+
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ log.error("Interrupted while waiting for Flask to stop", e);
+ }
+
+ flaskReady = false;
+ }
+ }
+
+ /**
+ * Reports whether Flask can serve requests.
+ *
+ * @return {@code true} only if the health check has passed and the process
+ * object exists and is still alive
+ */
+ public boolean isFlaskReady() {
+ return flaskReady && flaskProcess != null && flaskProcess.isAlive();
+ }
+
+ /**
+ * Builds the Flask base URL from the configured host and port.
+ *
+ * @return {@code http://<flaskHost>:<flaskPort>}, without a trailing slash
+ */
+ public String getFlaskBaseUrl() {
+ return String.format("http://%s:%d", flaskHost, flaskPort);
+ }
+
+    /**
+     * Restarts the Flask process: stop, pause two seconds, start. If the pause
+     * is interrupted the interrupt flag is restored and the start still runs.
+     */
+    public void restartFlask() {
+        log.info("Restarting Flask OCR API server...");
+        stopFlaskProcess();
+
+        final long pauseSeconds = 2;
+        try {
+            TimeUnit.SECONDS.sleep(pauseSeconds);
+        } catch (InterruptedException interrupted) {
+            Thread.currentThread().interrupt();
+        }
+
+        startFlaskProcess();
+    }
+
+    /**
+     * Makes sure a ready Flask process exists: waits for a live but not yet
+     * ready process, or starts a new one when none is alive.
+     *
+     * @return {@code true} if Flask is ready when this method returns;
+     *         {@code false} if management is disabled or readiness was not reached
+     */
+    public synchronized boolean ensureFlaskRunning() {
+        if (!flaskEnabled) {
+            log.info("Flask process management is disabled");
+            return false;
+        }
+
+        final boolean alive = flaskProcess != null && flaskProcess.isAlive();
+        if (alive) {
+            if (!flaskReady) {
+                waitForFlaskReady();
+            }
+            return flaskReady;
+        }
+
+        startFlaskProcess();
+        return flaskReady;
+    }
+
+ private Path resolveEmbeddedPython() {
+ // Prefer embedded standalone runtime first
+ Path linuxPython310 = Paths.get(resourceDir, "python-runtime/python/bin/python3.10");
+ if (Files.exists(linuxPython310)) {
+ return linuxPython310;
+ }
+ Path linuxPython3 = Paths.get(resourceDir, "python-runtime/python/bin/python3");
+ if (Files.exists(linuxPython3)) {
+ return linuxPython3;
+ }
+ Path linuxPython = Paths.get(resourceDir, "python-runtime/python/bin/python");
+ if (Files.exists(linuxPython)) {
+ return linuxPython;
+ }
+ // Fallback to venv interpreter if bundled (still embedded)
+ linuxPython310 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.10");
+ if (Files.exists(linuxPython310)) {
+ return linuxPython310;
+ }
+ Path linuxPython311 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.11");
+ if (Files.exists(linuxPython311)) {
+ return linuxPython311;
+ }
+ Path linuxPython39 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.9");
+ if (Files.exists(linuxPython39)) {
+ return linuxPython39;
+ }
+ Path linuxPython38 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.8");
+ if (Files.exists(linuxPython38)) {
+ return linuxPython38;
+ }
+ Path linuxPythonVenv = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3");
+ if (Files.exists(linuxPythonVenv)) {
+ return linuxPythonVenv;
+ }
+ Path linuxPythonAlt = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python");
+ if (Files.exists(linuxPythonAlt)) {
+ return linuxPythonAlt;
+ }
+ Path windowsPython = Paths.get(resourceDir, "python-runtime/venv-offline/Scripts/python.exe");
+ if (Files.exists(windowsPython)) {
+ return windowsPython;
+ }
+ Path windowsPythonAlt = Paths.get(resourceDir, "python-runtime/python.exe");
+ if (Files.exists(windowsPythonAlt)) {
+ return windowsPythonAlt;
+ }
+ return null;
+ }
+
+ private Path resolveEmbeddedPythonHome() {
+ Path pythonHome = Paths.get(resourceDir, "python-runtime/python");
+ if (Files.isDirectory(pythonHome)) {
+ return pythonHome;
+ }
+ return null;
+ }
+
+    /**
+     * Locates the {@code site-packages} directory of the bundled venv.
+     * Linux venvs use {@code lib/pythonX.Y/site-packages}, so the {@code lib}
+     * directory is scanned for a {@code python*} subdirectory; Windows venvs
+     * use {@code Lib/site-packages}.
+     *
+     * @return the directory, or {@code null} if the venv has none
+     */
+    private Path resolveEmbeddedSitePackages() {
+        // Linux venvs use lib/pythonX.Y/site-packages
+        Path libRoot = Paths.get(resourceDir, "python-runtime/venv-offline/lib");
+        if (Files.isDirectory(libRoot)) {
+            try (java.nio.file.DirectoryStream<Path> stream = Files.newDirectoryStream(libRoot)) {
+                for (Path p : stream) {
+                    if (Files.isDirectory(p) && p.getFileName().toString().startsWith("python")) {
+                        Path site = p.resolve("site-packages");
+                        if (Files.exists(site)) {
+                            return site;
+                        }
+                    }
+                }
+            } catch (IOException e) {
+                log.warn("Failed to scan embedded site-packages: {}", e.getMessage());
+            }
+        }
+        // Windows venvs
+        Path venvLibWin = Paths.get(resourceDir, "python-runtime/venv-offline/Lib/site-packages");
+        if (Files.exists(venvLibWin)) {
+            return venvLibWin;
+        }
+        return null;
+    }
+}