From 9064d3ea105b3f3e5334fa63b294f95a374598c0 Mon Sep 17 00:00:00 2001 From: huangrh Date: Thu, 19 Mar 2026 14:18:14 +0800 Subject: [PATCH] Package embedded Python archives and enforce embedded runtime --- pom.xml | 45 +- requirements-offline.txt | 6 +- .../common/utils/ResourceExtractor.java | 471 ++++++++++++++++++ .../ocr/engine/FlaskProcessManager.java | 424 ++++++++++++++++ 4 files changed, 910 insertions(+), 36 deletions(-) create mode 100644 src/main/java/com/chinaweal/youfool/reportdetect/common/utils/ResourceExtractor.java create mode 100644 src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/engine/FlaskProcessManager.java diff --git a/pom.xml b/pom.xml index 8b17884..85979f9 100644 --- a/pom.xml +++ b/pom.xml @@ -171,6 +171,11 @@ jsoup 1.17.2 + + org.apache.commons + commons-compress + 1.26.1 + @@ -247,9 +252,9 @@ - + - package-python-static + package-python-archives process-resources copy-resources @@ -258,37 +263,11 @@ ${project.build.directory}/classes/python-runtime - packaging/python/python-3.9-linux-static - - **/*.pyc - **/__pycache__/** - **/test/** - **/tests/** - - - - - - - - package-python-venv - process-resources - - copy-resources - - - ${project.build.directory}/classes/python-runtime/venv-offline - - - packaging/python/venv-linux-offline - - **/*.pyc - **/__pycache__/** - **/tests/** - **/test/** - **/*.md - **/*.dist-info/** - + packaging/python + + python-runtime.tar.gz + venv-offline.tar.gz + diff --git a/requirements-offline.txt b/requirements-offline.txt index 588a116..f21ef50 100644 --- a/requirements-offline.txt +++ b/requirements-offline.txt @@ -1,11 +1,11 @@ # Offline Python Requirements - Fixed Versions for Reproducibility # Generated: 2026-03-04 -# Target: Linux x86_64, Python 3.9 +# Target: Linux x86_64, Python 3.10 # Core OCR Dependencies paddleocr==2.7.5 -paddlepaddle==2.5.2 -opencv-python==4.8.0.76 +paddlepaddle==2.6.2 +opencv-python==4.6.0.66 pymupdf==1.23.0 pikepdf==8.0.0 diff --git 
/*
 * Patch metadata preserved verbatim from the enclosing git patch
 * (continuation of the "diff --git" header for this file):
 *
 * a/src/main/java/com/chinaweal/youfool/reportdetect/common/utils/ResourceExtractor.java
 * b/src/main/java/com/chinaweal/youfool/reportdetect/common/utils/ResourceExtractor.java
 * new file mode 100644
 * index 0000000..0b14894
 * --- /dev/null
 * +++ b/src/main/java/com/chinaweal/youfool/reportdetect/common/utils/ResourceExtractor.java
 * @@ -0,0 +1,471 @@
 */
package com.chinaweal.youfool.reportdetect.common.utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
import org.springframework.core.io.support.ResourcePatternResolver;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.security.CodeSource;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;

/**
 * Extracts Python resources and model files from the application JAR/classpath
 * into working directories on disk.
 *
 * <p>Target locations come from {@code app.ocr.resource-dir} and
 * {@code app.ocr.models-dir}. When {@code app.ocr.extract-on-startup} is true
 * (the default), {@link #extractIfMissing()} runs once after bean construction.
 */
@Component
public class ResourceExtractor {

    private static final Logger log = LoggerFactory.getLogger(ResourceExtractor.class);

    @Value("${app.ocr.resource-dir:./ocr-resources}")
    private String resourceDir;

    @Value("${app.ocr.models-dir:./models}")
    private String modelsDir;

    @Value("${app.ocr.extract-on-startup:true}")
    private boolean extractOnStartup;

    /**
     * Runs the startup extraction when enabled. Failures are logged rather than
     * propagated, so a failed extraction does not abort bean initialization.
     */
    @PostConstruct
    public void init() {
        if (!extractOnStartup) {
            return;
        }
        try {
            extractIfMissing();
        } catch (IOException e) {
            log.error("Failed to extract resources on startup", e);
        }
    }

    /**
     * Extracts the Python scripts and prepares the models directory if they do
     * not exist yet.
     *
     * @throws IOException if a directory cannot be created or a file cannot be written
     */
    public void extractIfMissing() throws IOException {
        extractPythonCode();
        extractModels();
    }

    /**
     * Extracts all offline resources (Python runtime, venv, models, scripts).
     * Used for complete offline deployment.
     *
     * @throws IOException if any extraction step fails
     */
    public void extractAllResources() throws IOException {
        log.info("========================================");
        log.info("开始提取离线资源");
        log.info("========================================");

        long startTime = System.currentTimeMillis();

        // 1. Python runtime
        extractPythonRuntime();
        // 2. Virtual environment
        extractVirtualEnvironment();
        // 3. OCR models
        extractOcrModels();
        // 4. Python scripts
        extractPythonCode();

        long duration = System.currentTimeMillis() - startTime;
        log.info("========================================");
        log.info("✓ 离线资源提取完成 (耗时: {}秒)", duration / 1000.0);
        log.info("========================================");
    }

    /**
     * Extracts the Python API scripts from the classpath into {@code resourceDir}.
     *
     * <p>Presence is checked per script rather than "directory is non-empty":
     * the runtime and venv are unpacked into the same directory, so a non-empty
     * directory does not imply the scripts are there. Existing script files are
     * never overwritten.
     */
    private void extractPythonCode() throws IOException {
        Path targetDir = Paths.get(resourceDir);

        // classpath location -> file name under resourceDir
        Map<String, String> pythonResources = new LinkedHashMap<>();
        pythonResources.put("python_api/ocr_api_server.py", "ocr_api_server.py");
        pythonResources.put("python_api/ocr_task_consumer.py", "ocr_task_consumer.py");
        pythonResources.put("src/main/python/pdf_processor.py", "pdf_processor.py");
        pythonResources.put("test_accuracy_batch_full.py", "test_accuracy_batch_full.py");

        Map<String, Path> missing = new LinkedHashMap<>();
        for (Map.Entry<String, String> entry : pythonResources.entrySet()) {
            Path target = targetDir.resolve(entry.getValue());
            if (!Files.isRegularFile(target)) {
                missing.put(entry.getKey(), target);
            }
        }

        if (missing.isEmpty()) {
            log.info("Python resources already exist at {}", targetDir);
            return;
        }

        log.info("Extracting Python resources to {}", targetDir);
        Files.createDirectories(targetDir);
        for (Map.Entry<String, Path> entry : missing.entrySet()) {
            extractResource(entry.getKey(), entry.getValue());
        }
        log.info("Python resources extracted successfully");
    }

    /**
     * Ensures the models directory exists and reports whether the essential
     * model files are present.
     *
     * <p>NOTE: This is a placeholder - actual model extraction depends on how
     * models are packaged. For large models (3-5GB), consider:
     * <ol>
     *   <li>External model download on first run</li>
     *   <li>Separate model package</li>
     *   <li>Docker volume mount</li>
     * </ol>
     */
    private void extractModels() throws IOException {
        Path targetDir = Paths.get(modelsDir);

        if (Files.exists(targetDir)) {
            Path[] essentialModels = {
                targetDir.resolve("pp-ocrv5/PP-OCRv5_server_det_onnx/inference.onnx"),
                targetDir.resolve("pp-ocrv5/PP-OCRv5_server_rec_onnx/inference.onnx")
            };

            boolean allExist = true;
            for (Path model : essentialModels) {
                if (!Files.exists(model)) {
                    allExist = false;
                    break;
                }
            }

            if (allExist) {
                log.info("Models directory already exists with essential models at {}", targetDir);
                return;
            }
        }

        log.warn("Models directory missing or incomplete at {}", targetDir);
        log.info("Models will be auto-downloaded by PaddleOCR on first use");

        Files.createDirectories(targetDir);

        // NOTE: For production, implement one of these strategies:
        //   Strategy 1: Download from CDN
        //   Strategy 2: Extract from JAR (if bundled)
        //   Strategy 3: Docker volume mount (recommended) - models mounted at runtime
    }

    /**
     * Copies a single classpath resource to {@code targetPath}, falling back to
     * a direct scan of the application JAR when the class loader cannot find it.
     *
     * @param resourcePath classpath-relative resource name
     * @param targetPath   destination file; overwritten if it exists
     */
    private void extractResource(String resourcePath, Path targetPath) throws IOException {
        // try-with-resources tolerates a null stream and closes it even if the copy fails
        try (InputStream is = getClass().getClassLoader().getResourceAsStream(resourcePath)) {
            if (is != null) {
                createParentDirectories(targetPath);
                Files.copy(is, targetPath, StandardCopyOption.REPLACE_EXISTING);
                log.debug("Extracted: {} -> {}", resourcePath, targetPath);
                return;
            }
        }

        log.warn("Resource not found in classpath: {}", resourcePath);
        if (isRunningInJar()) {
            extractFromJar(resourcePath, targetPath);
        }
    }

    /**
     * Extracts a file by scanning the application JAR (for resources not found
     * via the class loader). An entry matches when its name equals
     * {@code resourcePath} or ends with {@code "/" + resourcePath}.
     */
    private void extractFromJar(String resourcePath, Path targetPath) throws IOException {
        Path jar = locateJarFile();
        if (jar == null) {
            return;
        }

        try (JarFile jarFile = new JarFile(jar.toFile())) {
            Enumeration<JarEntry> entries = jarFile.entries();
            while (entries.hasMoreElements()) {
                JarEntry entry = entries.nextElement();
                String entryName = entry.getName();
                if (!entryName.equals(resourcePath) && !entryName.endsWith("/" + resourcePath)) {
                    continue;
                }
                try (InputStream is = jarFile.getInputStream(entry)) {
                    createParentDirectories(targetPath);
                    Files.copy(is, targetPath, StandardCopyOption.REPLACE_EXISTING);
                    log.debug("Extracted from JAR: {} -> {}", entryName, targetPath);
                    return;
                }
            }
        }

        log.warn("Resource not found in JAR: {}", resourcePath);
    }

    /**
     * Checks whether the application is running from a JAR file that can be
     * located on disk.
     */
    private boolean isRunningInJar() {
        return locateJarFile() != null;
    }

    /**
     * Resolves the on-disk JAR this class was loaded from.
     *
     * <p>Handles both a plain {@code file:/path/app.jar} code source and a
     * nested {@code jar:file:/path/app.jar!/...} one by stripping the
     * {@code jar:} prefix and everything from the first {@code "!/"}.
     *
     * @return the JAR path, or {@code null} when the code source is missing or
     *         is not a local {@code .jar} file
     */
    private Path locateJarFile() {
        CodeSource codeSource = getClass().getProtectionDomain().getCodeSource();
        if (codeSource == null || codeSource.getLocation() == null) {
            return null;
        }
        String location = codeSource.getLocation().toString();
        if (location.startsWith("jar:")) {
            location = location.substring("jar:".length());
        }
        int nested = location.indexOf("!/");
        if (nested >= 0) {
            location = location.substring(0, nested);
        }
        if (!location.startsWith("file:") || !location.endsWith(".jar")) {
            return null;
        }
        try {
            return Paths.get(URI.create(location));
        } catch (RuntimeException e) {
            // Malformed/unencoded URL or unsupported file system: treat as "not in a JAR"
            log.debug("Unable to resolve JAR location {}", location, e);
            return null;
        }
    }

    /** Creates the parent directory of {@code path} when it has one. */
    private static void createParentDirectories(Path path) throws IOException {
        Path parent = path.getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }
    }

    /**
     * Recursively deletes a directory. Does nothing if it does not exist.
     *
     * @param directory root of the tree to delete
     * @throws IOException if a file or directory cannot be listed or deleted
     */
    public void deleteDirectory(Path directory) throws IOException {
        if (!Files.exists(directory)) {
            return;
        }
        Files.walkFileTree(directory, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
                if (exc != null) {
                    // Directory iteration failed part-way; surface it instead of masking it
                    throw exc;
                }
                Files.delete(dir);
                return FileVisitResult.CONTINUE;
            }
        });
    }

    public String getResourceDir() {
        return resourceDir;
    }

    public String getModelsDir() {
        return modelsDir;
    }

    public boolean isExtractOnStartup() {
        return extractOnStartup;
    }

    // ========== Offline Deployment Methods ==========

    /**
     * Unpacks the bundled Python runtime into {@code <resourceDir>/python-runtime},
     * preferring the {@code python-runtime.tar.gz} archive and falling back to a
     * file-by-file classpath copy.
     */
    private void extractPythonRuntime() throws IOException {
        Path targetDir = Paths.get(resourceDir, "python-runtime");

        if (Files.exists(targetDir.resolve("python/bin/python3.10")) ||
            Files.exists(targetDir.resolve("python/bin/python3"))) {
            log.info("[1/4] Python runtime already exists, skipping");
            return;
        }

        log.info("[1/4] Extracting Python runtime...");
        Files.createDirectories(targetDir);
        if (!extractTarGzResource("python-runtime/python-runtime.tar.gz", targetDir)) {
            copyDirectoryFromClasspath("python-runtime", targetDir);
        }
        makeExecutable(targetDir.resolve("python/bin/python3.10"));
        makeExecutable(targetDir.resolve("python/bin/python3"));
        log.info(" Done");
    }

    /**
     * Unpacks the bundled virtual environment into
     * {@code <resourceDir>/python-runtime/venv-offline}. The archive is extracted
     * into the parent directory; the classpath fallback copies into the venv
     * directory itself.
     */
    private void extractVirtualEnvironment() throws IOException {
        Path targetDir = Paths.get(resourceDir, "python-runtime", "venv-offline");

        if (Files.exists(targetDir.resolve("bin/python3.10")) ||
            Files.exists(targetDir.resolve("bin/python3"))) {
            log.info("[2/4] Python venv already exists, skipping");
            return;
        }

        log.info("[2/4] Extracting Python venv...");
        Files.createDirectories(targetDir.getParent());
        if (!extractTarGzResource("python-runtime/venv-offline.tar.gz", targetDir.getParent())) {
            Files.createDirectories(targetDir);
            copyDirectoryFromClasspath("python-runtime/venv-offline", targetDir);
        }
        makeExecutable(targetDir.resolve("bin/python3.10"));
        makeExecutable(targetDir.resolve("bin/python3"));
        log.info(" Done");
    }

    /**
     * Copies bundled OCR models from the classpath into {@code modelsDir} unless
     * the detection model is already present.
     */
    private void extractOcrModels() throws IOException {
        Path targetDir = Paths.get(modelsDir);

        // NOTE(review): this marker path (pp-ocrv5/det_model) differs from the layout
        // checked in extractModels() (pp-ocrv5/PP-OCRv5_server_det_onnx) - confirm which
        // layout the bundled models actually use.
        if (Files.exists(targetDir.resolve("pp-ocrv5/det_model/inference.onnx"))) {
            log.info("[3/4] OCR模型已存在,跳过");
            return;
        }

        log.info("[3/4] 提取OCR模型...");
        Files.createDirectories(targetDir);
        copyDirectoryFromClasspath("models", targetDir);
        log.info(" ✓ 完成");
    }

    /**
     * Copies every readable file below {@code resourcePath} on the classpath into
     * {@code targetDir}, preserving the relative layout. Files that cannot be
     * copied are skipped (best effort).
     *
     * @param resourcePath classpath directory, without leading or trailing slash
     * @param targetDir    destination directory
     */
    private void copyDirectoryFromClasspath(String resourcePath, Path targetDir) throws IOException {
        log.info(" 复制资源: {}", resourcePath);

        ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
        Resource[] resources = resolver.getResources("classpath*:" + resourcePath + "/**");
        Path root = targetDir.toAbsolutePath().normalize();

        int fileCount = 0;
        for (Resource resource : resources) {
            if (!resource.isReadable() || resource.getURL().toString().endsWith("/")) {
                continue;
            }
            try {
                String path = resource.getURL().getPath();
                // For jar URLs, search only after the last "!/" so a matching name in the
                // jar file's own path is not mistaken for the resource directory.
                int searchFrom = Math.max(path.lastIndexOf("!/"), 0);
                int start = path.indexOf(resourcePath, searchFrom);
                if (start < 0) {
                    log.debug("跳过文件: {}", resource.getFilename());
                    continue;
                }
                String relative = path.substring(start + resourcePath.length());
                // A leading '/' would make Path.resolve treat it as absolute and write
                // outside targetDir, so strip it.
                while (relative.startsWith("/")) {
                    relative = relative.substring(1);
                }
                if (relative.isEmpty()) {
                    continue;
                }
                Path targetPath = root.resolve(relative).normalize();
                if (!targetPath.startsWith(root)) {
                    log.debug("跳过文件: {}", resource.getFilename());
                    continue;
                }

                createParentDirectories(targetPath);
                try (InputStream in = resource.getInputStream()) {
                    Files.copy(in, targetPath, StandardCopyOption.REPLACE_EXISTING);
                }
                fileCount++;
            } catch (Exception e) {
                // Best effort: one unreadable resource must not abort the whole copy
                log.debug("跳过文件: {}", resource.getFilename(), e);
            }
        }
        log.info(" 复制了 {} 个文件", fileCount);
    }

    /**
     * Extracts a {@code .tar.gz} classpath resource into {@code targetDir}.
     *
     * <p>The archive is first copied to a temp file (avoids nested-jar stream
     * issues), then unpacked with the system {@code tar} on Linux, falling back
     * to the pure-Java extractor. The temp file is always removed.
     *
     * @param resourceName classpath name of the archive
     * @param targetDir    directory to extract into (created if missing)
     * @return {@code true} if at least one file was extracted; {@code false} if
     *         the resource is missing, empty, or yielded no files, so callers
     *         can fall back to another strategy
     * @throws IOException if the archive cannot be read or a file cannot be written
     */
    private boolean extractTarGzResource(String resourceName, Path targetDir) throws IOException {
        ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
        Resource resource = resolver.getResource("classpath:" + resourceName);
        if (!resource.exists()) {
            return false;
        }

        log.info(" Extracting archive: {}", resourceName);
        Files.createDirectories(targetDir);

        Path tmp = Files.createTempFile("ocr-archive-", ".tar.gz");
        try {
            try (InputStream is = resource.getInputStream()) {
                Files.copy(is, tmp, StandardCopyOption.REPLACE_EXISTING);
            }

            long tmpSize = Files.size(tmp);
            if (tmpSize <= 0) {
                log.warn(" Archive size is 0 bytes: {}", resourceName);
                return false;
            }
            log.info(" Archive temp file: {} ({} bytes)", tmp.toAbsolutePath(), tmpSize);

            // Prefer system tar on Linux: it preserves symlinks and permissions
            if (isLinux() && extractWithSystemTar(tmp, targetDir, resourceName)) {
                return true;
            }

            int extracted = extractWithJavaTar(tmp, targetDir);
            if (extracted == 0) {
                log.warn(" Archive extracted 0 files: {}", resourceName);
                return false;
            }
            log.info(" Extracted {} files from {}", extracted, resourceName);
            return true;
        } finally {
            try {
                Files.deleteIfExists(tmp);
            } catch (IOException ignored) {
                // Temp-file cleanup is best effort; the OS temp dir is reclaimed eventually
                log.debug("Could not delete temp archive {}", tmp);
            }
        }
    }

    /**
     * Runs {@code tar -xzf <archive> -C <targetDir>} and reports success.
     * Failures are logged and reported as {@code false}, never thrown.
     */
    private boolean extractWithSystemTar(Path archive, Path targetDir, String resourceName) {
        Process process = null;
        try {
            ProcessBuilder pb = new ProcessBuilder(
                "tar", "-xzf", archive.toAbsolutePath().toString(),
                "-C", targetDir.toAbsolutePath().toString());
            pb.redirectErrorStream(true);
            process = pb.start();

            // Drain output so tar cannot block on a full pipe
            StringBuilder out = new StringBuilder();
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    out.append(line).append('\n');
                }
            }

            int code = process.waitFor();
            if (code == 0) {
                log.info(" System tar extraction succeeded: {}", resourceName);
                return true;
            }
            log.warn(" System tar extraction failed (code {}): {}", code, resourceName);
            if (out.length() > 0) {
                log.warn(" System tar output: {}", out.toString().trim());
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag for callers and stop the child process
            Thread.currentThread().interrupt();
            if (process != null) {
                process.destroy();
            }
            log.warn(" System tar extraction error: {}", e.getMessage());
        } catch (IOException | RuntimeException e) {
            log.warn(" System tar extraction error: {}", e.getMessage());
        }
        return false;
    }

    /**
     * Pure-Java tar.gz extraction used when system tar is unavailable or failed.
     * Entries resolving outside {@code targetDir} and symbolic/hard links are
     * skipped; the executable bit from the tar header is re-applied.
     *
     * @return number of regular files written
     */
    private int extractWithJavaTar(Path archive, Path targetDir) throws IOException {
        // Compare against an absolute, normalized root. A relative root such as
        // "./ocr-resources" never prefixes a normalized entry path, which would reject
        // every entry.
        Path root = targetDir.toAbsolutePath().normalize();
        int extracted = 0;

        try (InputStream fis = Files.newInputStream(archive);
             GZIPInputStream gis = new GZIPInputStream(fis);
             TarArchiveInputStream tis = new TarArchiveInputStream(gis)) {

            TarArchiveEntry entry;
            while ((entry = tis.getNextTarEntry()) != null) {
                String entryName = entry.getName();
                if (entryName == null || entryName.isEmpty()) {
                    continue;
                }
                Path outPath = root.resolve(entryName).normalize();
                if (!outPath.startsWith(root)) {
                    // Path traversal ("../" or absolute entry name): never write outside root
                    continue;
                }
                if (entry.isDirectory()) {
                    Files.createDirectories(outPath);
                    continue;
                }
                if (entry.isSymbolicLink() || entry.isLink()) {
                    // Skip symlinks to keep Windows compatibility
                    continue;
                }
                createParentDirectories(outPath);
                Files.copy(tis, outPath, StandardCopyOption.REPLACE_EXISTING);
                if ((entry.getMode() & 0111) != 0) {
                    // Keep interpreters and shared tools runnable
                    outPath.toFile().setExecutable(true, false);
                }
                extracted++;
            }
        }
        return extracted;
    }

    private boolean isLinux() {
        String os = System.getProperty("os.name", "").toLowerCase(Locale.ROOT);
        return os.contains("linux");
    }

    /**
     * Marks {@code file} executable for all users when it exists. Failures are
     * logged, not thrown.
     */
    private void makeExecutable(Path file) {
        if (!Files.exists(file)) {
            return;
        }
        try {
            // Equivalent to "chmod a+x" without spawning an un-awaited child process
            if (!file.toFile().setExecutable(true, false)) {
                log.warn("无法设置可执行权限: {}", file);
            }
        } catch (SecurityException e) {
            log.warn("无法设置可执行权限: {}", file, e);
        }
    }
}

/*
 * Patch metadata preserved verbatim from the enclosing git patch
 * (start of the next file's "diff --git" header; it continues on the following line):
 *
 * diff --git a/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/engine/FlaskProcessManager.java b/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/engine/FlaskProcessManager.java
 * new file mode 100644
 * index 0000000..7bd12e0
 * --- /dev/null
 * +++
 */
/*
 * Patch metadata preserved verbatim from the enclosing git patch
 * (tail of the "+++" header line for this file, followed by the hunk header):
 *
 * b/src/main/java/com/chinaweal/youfool/reportdetect/modules/ocr/engine/FlaskProcessManager.java
 * @@ -0,0 +1,424 @@
 */
package com.chinaweal.youfool.reportdetect.modules.ocr.engine;

import com.chinaweal.youfool.reportdetect.common.utils.ResourceExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.ApplicationListener;
import org.springframework.context.event.ContextRefreshedEvent;
import org.springframework.stereotype.Component;

import javax.annotation.PreDestroy;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Manages the Flask OCR API process lifecycle:
 * <ul>
 *   <li>starts Flask with the embedded Python when the application context is refreshed,</li>
 *   <li>polls the Flask {@code /health} endpoint until it is ready or the timeout elapses,</li>
 *   <li>stops the process on application shutdown.</li>
 * </ul>
 *
 * <p>Start-up paths are serialized with {@code synchronized}; the process handle
 * and readiness flag are {@code volatile} because they are also read by the
 * log-reader thread and by unsynchronized accessors.
 */
@Component
public class FlaskProcessManager implements ApplicationListener<ContextRefreshedEvent> {

    private static final Logger log = LoggerFactory.getLogger(FlaskProcessManager.class);

    /**
     * Interpreter locations relative to {@code resourceDir}, in priority order:
     * standalone runtime first, then the bundled venv (Linux, then Windows layouts).
     */
    private static final String[] EMBEDDED_PYTHON_CANDIDATES = {
        "python-runtime/python/bin/python3.10",
        "python-runtime/python/bin/python3",
        "python-runtime/python/bin/python",
        "python-runtime/venv-offline/bin/python3.10",
        "python-runtime/venv-offline/bin/python3.11",
        "python-runtime/venv-offline/bin/python3.9",
        "python-runtime/venv-offline/bin/python3.8",
        "python-runtime/venv-offline/bin/python3",
        "python-runtime/venv-offline/bin/python",
        "python-runtime/venv-offline/Scripts/python.exe",
        "python-runtime/python.exe"
    };

    @Value("${app.ocr.flask.enabled:true}")
    private boolean flaskEnabled;

    @Value("${app.ocr.flask.host:127.0.0.1}")
    private String flaskHost;

    @Value("${app.ocr.flask.port:8081}")
    private int flaskPort;

    @Value("${app.ocr.flask.startup-timeout:60}")
    private int startupTimeoutSeconds;

    @Value("${app.ocr.flask.disable-model-source-check:true}")
    private boolean disableModelSourceCheck;

    // Bound from configuration but not read in this class: only the embedded
    // interpreter is ever used to start Flask.
    @Value("${app.ocr.python-command:python}")
    private String pythonCommand;

    @Value("${app.ocr.resource-dir:./ocr-resources}")
    private String resourceDir;

    @Value("${app.ocr.models-dir:./models}")
    private String modelsDir;

    @Autowired
    private ResourceExtractor resourceExtractor;

    private volatile Process flaskProcess;
    private volatile boolean flaskReady = false;

    @Override
    public void onApplicationEvent(ContextRefreshedEvent event) {
        if (flaskEnabled) {
            startFlaskProcess();
        } else {
            log.info("Flask process management is disabled");
        }
    }

    /**
     * Starts the Flask API server with the embedded Python (offline mode).
     *
     * <p>No-op when Flask management is disabled or a process is already alive.
     * Blocks until the health check passes, the process exits, or the startup
     * timeout elapses. Failures are logged, not thrown.
     */
    public synchronized void startFlaskProcess() {
        if (!flaskEnabled) {
            log.info("Flask process management is disabled");
            return;
        }
        Process running = flaskProcess;
        if (running != null && running.isAlive()) {
            log.info("Flask OCR API server is already running");
            return;
        }
        log.info("Starting Flask OCR API server...");

        // A previous (now dead) process may have left the flag set; never let a
        // stale "ready" leak into the new start attempt.
        flaskReady = false;

        try {
            // Extract all resources for offline mode
            resourceExtractor.extractAllResources();

            // Require embedded Python (Linux + Windows layouts)
            Path embeddedPython = resolveEmbeddedPython();
            if (embeddedPython == null) {
                log.error("Embedded Python not found. Refusing to start Flask with system Python.");
                log.error("Expected embedded runtime under: {}/python-runtime/python", resourceDir);
                return;
            }
            // Absolute path: does not depend on how a relative executable is resolved
            // against the child's working directory.
            String pythonExecutable = embeddedPython.toAbsolutePath().normalize().toString();
            log.info("Using embedded Python: {}", pythonExecutable);

            Path apiScriptPath = resolveApiScript();
            if (apiScriptPath == null) {
                return;
            }

            List<String> command = new ArrayList<>();
            command.add(pythonExecutable);
            command.add(apiScriptPath.toAbsolutePath().toString());

            ProcessBuilder pb = new ProcessBuilder(command);
            // Set working directory to project root (where python_api is located)
            pb.directory(new File("."));
            configureEnvironment(pb.environment());
            pb.redirectErrorStream(true);

            log.info("Starting Flask with embedded Python: {}", pythonExecutable);
            log.info("Flask will listen on: http://{}:{}", flaskHost, flaskPort);

            Process started = pb.start();
            flaskProcess = started;
            startFlaskLogReader(started);
            waitForFlaskReady();

            if (flaskReady) {
                log.info("✓ Flask OCR API server started successfully (offline mode)");
            } else if (!started.isAlive()) {
                log.error("Flask OCR API server exited during startup (exit code {})", started.exitValue());
            } else {
                log.warn("Flask OCR API server not ready yet; keeping process running for late readiness");
            }

        } catch (Exception e) {
            log.error("Failed to start Flask process (offline mode)", e);
        }
    }

    /**
     * Locates the Flask API script, trying in order: the project's
     * {@code ./python_api} directory (development), {@code <resourceDir>/python-api}
     * (offline deployment), then the root of {@code resourceDir}.
     *
     * @return the first existing script path, or {@code null} (after logging the
     *         searched locations) if none exists
     */
    private Path resolveApiScript() {
        Path[] candidates = {
            Paths.get("./python_api/ocr_api_server.py"),
            Paths.get(resourceDir, "python-api/ocr_api_server.py"),
            Paths.get(resourceDir, "ocr_api_server.py")
        };
        for (Path candidate : candidates) {
            if (Files.exists(candidate)) {
                log.info("Found Flask API script at: {}", candidate);
                return candidate;
            }
        }
        log.error("Flask API script not found at any of the following locations:");
        log.error(" 1. ./python_api/ocr_api_server.py");
        log.error(" 2. {}/python-api/ocr_api_server.py", resourceDir);
        log.error(" 3. {}/ocr_api_server.py", resourceDir);
        return null;
    }

    /**
     * Populates the child-process environment: Python I/O settings, bundled
     * site-packages and Python home when present, model locations, and the
     * host/port Flask should bind to.
     */
    private void configureEnvironment(Map<String, String> env) {
        env.put("PYTHONIOENCODING", "utf-8");
        env.put("PYTHONUNBUFFERED", "1");
        // Note: Don't set PYTHONNOUSERSITE=1 as it blocks loading system-installed Flask

        // Use bundled libraries if available (Linux/Windows)
        Path libPath = resolveEmbeddedSitePackages();
        if (libPath != null) {
            env.put("PYTHONPATH", libPath.toAbsolutePath().normalize().toString());
        }
        // Prefer embedded Python home when using standalone runtime
        Path pythonHome = resolveEmbeddedPythonHome();
        if (pythonHome != null) {
            env.put("PYTHONHOME", pythonHome.toAbsolutePath().normalize().toString());
        }

        // Use bundled models
        Path modelsPath = Paths.get(modelsDir).toAbsolutePath();
        env.put("PADDLEOCR_HOME", modelsPath.toString());
        env.put("HUB_HOME", modelsPath.toString());
        if (disableModelSourceCheck) {
            env.put("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True");
        }

        // Flask configuration
        env.put("PORT", String.valueOf(flaskPort));
        env.put("HOST", flaskHost);
    }

    /**
     * Starts a daemon thread that forwards the given process's merged
     * stdout/stderr to the application log.
     *
     * @param process the process to read from; captured explicitly so a later
     *                restart cannot redirect this reader to a different process
     */
    private void startFlaskLogReader(Process process) {
        Thread logReader = new Thread(() -> {
            // The child is started with PYTHONIOENCODING=utf-8, so decode as UTF-8
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    log.info("[Flask] {}", line);
                }
            } catch (IOException e) {
                // A read error after the process has exited is expected; only report live failures
                if (process.isAlive()) {
                    log.warn("Error reading Flask logs", e);
                }
            }
        });

        logReader.setDaemon(true);
        logReader.setName("Flask-Log-Reader");
        logReader.start();
    }

    /**
     * Polls the Flask {@code /health} endpoint every two seconds until it returns
     * HTTP 200 (sets {@code flaskReady}), the process dies, the thread is
     * interrupted, or {@code startupTimeoutSeconds} elapses.
     */
    private void waitForFlaskReady() {
        log.info("Waiting for Flask to be ready (timeout: {}s)", startupTimeoutSeconds);

        Process process = flaskProcess;
        if (process == null) {
            return;
        }

        String healthUrl = String.format("http://%s:%d/health", flaskHost, flaskPort);
        long startTime = System.currentTimeMillis();
        long timeoutMillis = TimeUnit.SECONDS.toMillis(startupTimeoutSeconds);

        while (System.currentTimeMillis() - startTime < timeoutMillis) {
            if (!process.isAlive()) {
                log.error("Flask process terminated unexpectedly");
                return;
            }

            if (isHealthy(healthUrl)) {
                log.info("Flask health check passed");
                flaskReady = true;
                return;
            }

            try {
                TimeUnit.SECONDS.sleep(2);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
                return;
            }
        }

        log.warn("Flask health check timeout after {}s", startupTimeoutSeconds);
    }

    /**
     * Performs one GET against the health endpoint.
     *
     * @return {@code true} only when the response code is 200
     */
    private boolean isHealthy(String healthUrl) {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(healthUrl).openConnection();
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(2000);
            conn.setReadTimeout(2000);
            return conn.getResponseCode() == 200;
        } catch (IOException e) {
            // Not ready yet, continue waiting
            log.debug("Flask not ready yet: {}", e.getMessage());
            return false;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Stops the Flask process: requests termination, waits up to 10 seconds,
     * then forces termination.
     */
    @PreDestroy
    public void stopFlaskProcess() {
        Process process = flaskProcess;
        if (process == null || !process.isAlive()) {
            return;
        }
        log.info("Stopping Flask OCR API server...");

        // Try graceful shutdown first
        process.destroy();

        try {
            // Wait up to 10 seconds for process to terminate
            if (!process.waitFor(10, TimeUnit.SECONDS)) {
                log.warn("Flask did not stop gracefully, forcing termination");
                process.destroyForcibly();
            }
            log.info("Flask OCR API server stopped");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for Flask to stop", e);
        }

        flaskReady = false;
    }

    /**
     * @return {@code true} when the health check has passed and the process is still alive
     */
    public boolean isFlaskReady() {
        Process process = flaskProcess;
        return flaskReady && process != null && process.isAlive();
    }

    /**
     * @return the Flask base URL, {@code http://<host>:<port>}
     */
    public String getFlaskBaseUrl() {
        return String.format("http://%s:%d", flaskHost, flaskPort);
    }

    /**
     * Restarts the Flask process: stop, pause two seconds, start.
     */
    public void restartFlask() {
        log.info("Restarting Flask OCR API server...");
        stopFlaskProcess();

        try {
            TimeUnit.SECONDS.sleep(2);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }

        startFlaskProcess();
    }

    /**
     * Ensures Flask is running, starting it or waiting for readiness as needed.
     *
     * @return {@code true} if Flask is ready when this method returns;
     *         {@code false} if management is disabled or Flask did not become ready
     */
    public synchronized boolean ensureFlaskRunning() {
        if (!flaskEnabled) {
            log.info("Flask process management is disabled");
            return false;
        }
        Process process = flaskProcess;
        boolean alive = process != null && process.isAlive();
        if (alive && flaskReady) {
            return true;
        }
        if (alive) {
            // Started earlier but not yet healthy: give it another readiness window
            waitForFlaskReady();
            return flaskReady;
        }
        startFlaskProcess();
        return flaskReady;
    }

    /**
     * Finds the embedded interpreter by probing the known layouts in priority order.
     *
     * @return the first existing interpreter path, or {@code null} if none exists
     */
    private Path resolveEmbeddedPython() {
        for (String candidate : EMBEDDED_PYTHON_CANDIDATES) {
            Path python = Paths.get(resourceDir, candidate);
            if (Files.exists(python)) {
                return python;
            }
        }
        return null;
    }

    /**
     * @return {@code <resourceDir>/python-runtime/python} if it is a directory, else {@code null}
     */
    private Path resolveEmbeddedPythonHome() {
        Path pythonHome = Paths.get(resourceDir, "python-runtime/python");
        return Files.isDirectory(pythonHome) ? pythonHome : null;
    }

    /**
     * Locates the bundled venv's {@code site-packages} directory.
     *
     * @return the Linux-layout ({@code lib/pythonX.Y/site-packages}) or
     *         Windows-layout ({@code Lib/site-packages}) directory, or
     *         {@code null} if neither exists
     */
    private Path resolveEmbeddedSitePackages() {
        // Linux venvs use lib/pythonX.Y/site-packages
        Path libRoot = Paths.get(resourceDir, "python-runtime/venv-offline/lib");
        if (Files.isDirectory(libRoot)) {
            try (DirectoryStream<Path> stream = Files.newDirectoryStream(libRoot)) {
                for (Path p : stream) {
                    if (Files.isDirectory(p) && p.getFileName().toString().startsWith("python")) {
                        Path site = p.resolve("site-packages");
                        if (Files.exists(site)) {
                            return site;
                        }
                    }
                }
            } catch (IOException e) {
                log.warn("Failed to scan embedded site-packages: {}", e.getMessage());
            }
        }
        // Windows venvs
        Path venvLibWin = Paths.get(resourceDir, "python-runtime/venv-offline/Lib/site-packages");
        if (Files.exists(venvLibWin)) {
            return venvLibWin;
        }
        return null;
    }
}