Package embedded Python archives and enforce embedded runtime
This commit is contained in:
parent
fc9cbcf1da
commit
9064d3ea10
45
pom.xml
45
pom.xml
|
|
@ -171,6 +171,11 @@
|
|||
<artifactId>jsoup</artifactId>
|
||||
<version>1.17.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-compress</artifactId>
|
||||
<version>1.26.1</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
@ -247,9 +252,9 @@
|
|||
</resources>
|
||||
</configuration>
|
||||
</execution>
|
||||
<!-- Package Python runtime for offline deployment -->
|
||||
<!-- Package Python runtime + venv archives for offline deployment (Windows-safe) -->
|
||||
<execution>
|
||||
<id>package-python-static</id>
|
||||
<id>package-python-archives</id>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>copy-resources</goal>
|
||||
|
|
@ -258,37 +263,11 @@
|
|||
<outputDirectory>${project.build.directory}/classes/python-runtime</outputDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>packaging/python/python-3.9-linux-static</directory>
|
||||
<excludes>
|
||||
<exclude>**/*.pyc</exclude>
|
||||
<exclude>**/__pycache__/**</exclude>
|
||||
<exclude>**/test/**</exclude>
|
||||
<exclude>**/tests/**</exclude>
|
||||
</excludes>
|
||||
</resource>
|
||||
</resources>
|
||||
</configuration>
|
||||
</execution>
|
||||
<!-- Package Python virtual environment for offline deployment -->
|
||||
<execution>
|
||||
<id>package-python-venv</id>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>copy-resources</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/classes/python-runtime/venv-offline</outputDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>packaging/python/venv-linux-offline</directory>
|
||||
<excludes>
|
||||
<exclude>**/*.pyc</exclude>
|
||||
<exclude>**/__pycache__/**</exclude>
|
||||
<exclude>**/tests/**</exclude>
|
||||
<exclude>**/test/**</exclude>
|
||||
<exclude>**/*.md</exclude>
|
||||
<exclude>**/*.dist-info/**</exclude>
|
||||
</excludes>
|
||||
<directory>packaging/python</directory>
|
||||
<includes>
|
||||
<include>python-runtime.tar.gz</include>
|
||||
<include>venv-offline.tar.gz</include>
|
||||
</includes>
|
||||
</resource>
|
||||
</resources>
|
||||
</configuration>
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
# Offline Python Requirements - Fixed Versions for Reproducibility
|
||||
# Generated: 2026-03-04
|
||||
# Target: Linux x86_64, Python 3.9
|
||||
# Target: Linux x86_64, Python 3.10
|
||||
|
||||
# Core OCR Dependencies
|
||||
paddleocr==2.7.5
|
||||
paddlepaddle==2.5.2
|
||||
opencv-python==4.8.0.76
|
||||
paddlepaddle==2.6.2
|
||||
opencv-python==4.6.0.66
|
||||
pymupdf==1.23.0
|
||||
pikepdf==8.0.0
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,471 @@
|
|||
package com.chinaweal.youfool.reportdetect.common.utils;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
|
||||
import org.springframework.core.io.support.ResourcePatternResolver;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.PostConstruct;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.*;
|
||||
import java.nio.file.attribute.BasicFileAttributes;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.jar.JarEntry;
|
||||
import java.util.jar.JarFile;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
|
||||
/**
|
||||
* Extracts Python resources and model files from JAR to working directory
|
||||
*/
|
||||
@Component
|
||||
public class ResourceExtractor {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ResourceExtractor.class);
|
||||
|
||||
@Value("${app.ocr.resource-dir:./ocr-resources}")
|
||||
private String resourceDir;
|
||||
|
||||
@Value("${app.ocr.models-dir:./models}")
|
||||
private String modelsDir;
|
||||
|
||||
@Value("${app.ocr.extract-on-startup:true}")
|
||||
private boolean extractOnStartup;
|
||||
|
||||
@PostConstruct
|
||||
public void init() {
|
||||
if (extractOnStartup) {
|
||||
try {
|
||||
extractIfMissing();
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to extract resources on startup", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract Python code and models if they don't exist
|
||||
*/
|
||||
public void extractIfMissing() throws IOException {
|
||||
extractPythonCode();
|
||||
extractModels();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all offline resources (Python runtime, venv, models, scripts)
|
||||
* Used for complete offline deployment
|
||||
*/
|
||||
public void extractAllResources() throws IOException {
|
||||
log.info("========================================");
|
||||
log.info("开始提取离线资源");
|
||||
log.info("========================================");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
// 1. Extract Python runtime
|
||||
extractPythonRuntime();
|
||||
|
||||
// 2. Extract virtual environment
|
||||
extractVirtualEnvironment();
|
||||
|
||||
// 3. Extract OCR models
|
||||
extractOcrModels();
|
||||
|
||||
// 4. Extract Python scripts
|
||||
extractPythonCode();
|
||||
|
||||
long duration = System.currentTimeMillis() - startTime;
|
||||
log.info("========================================");
|
||||
log.info("✓ 离线资源提取完成 (耗时: {}秒)", duration / 1000.0);
|
||||
log.info("========================================");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract Python API code from JAR to working directory
|
||||
*/
|
||||
private void extractPythonCode() throws IOException {
|
||||
Path targetDir = Paths.get(resourceDir);
|
||||
|
||||
if (Files.exists(targetDir) &&
|
||||
Files.list(targetDir).findAny().isPresent()) {
|
||||
log.info("Python resources already exist at {}", targetDir);
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("Extracting Python resources to {}", targetDir);
|
||||
Files.createDirectories(targetDir);
|
||||
|
||||
// List of Python resources to extract
|
||||
Map<String, String> pythonResources = new HashMap<>();
|
||||
pythonResources.put("python_api/ocr_api_server.py", "ocr_api_server.py");
|
||||
pythonResources.put("python_api/ocr_task_consumer.py", "ocr_task_consumer.py");
|
||||
pythonResources.put("src/main/python/pdf_processor.py", "pdf_processor.py");
|
||||
pythonResources.put("test_accuracy_batch_full.py", "test_accuracy_batch_full.py");
|
||||
|
||||
for (Map.Entry<String, String> entry : pythonResources.entrySet()) {
|
||||
String sourcePath = entry.getKey();
|
||||
String targetName = entry.getValue();
|
||||
extractResource(sourcePath, targetDir.resolve(targetName));
|
||||
}
|
||||
|
||||
log.info("Python resources extracted successfully");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract model files from JAR to working directory
|
||||
* NOTE: This is a placeholder - actual model extraction depends on how models are packaged
|
||||
* For large models (3-5GB), consider:
|
||||
* 1. External model download on first run
|
||||
* 2. Separate model package
|
||||
* 3. Docker volume mount
|
||||
*/
|
||||
private void extractModels() throws IOException {
|
||||
Path targetDir = Paths.get(modelsDir);
|
||||
|
||||
if (Files.exists(targetDir)) {
|
||||
// Check if essential models exist
|
||||
Path[] essentialModels = {
|
||||
targetDir.resolve("pp-ocrv5/PP-OCRv5_server_det_onnx/inference.onnx"),
|
||||
targetDir.resolve("pp-ocrv5/PP-OCRv5_server_rec_onnx/inference.onnx")
|
||||
};
|
||||
|
||||
boolean allExist = true;
|
||||
for (Path model : essentialModels) {
|
||||
if (!Files.exists(model)) {
|
||||
allExist = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (allExist) {
|
||||
log.info("Models directory already exists with essential models at {}", targetDir);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
log.warn("Models directory missing or incomplete at {}", targetDir);
|
||||
log.info("Models will be auto-downloaded by PaddleOCR on first use");
|
||||
|
||||
// Create directory structure
|
||||
Files.createDirectories(targetDir);
|
||||
|
||||
// NOTE: For production, implement one of these strategies:
|
||||
//
|
||||
// Strategy 1: Download from CDN
|
||||
// downloadModelsFromCdn(targetDir);
|
||||
//
|
||||
// Strategy 2: Extract from JAR (if bundled)
|
||||
// extractModelsFromJar(targetDir);
|
||||
//
|
||||
// Strategy 3: Docker volume mount (recommended)
|
||||
// Models are mounted as volume at runtime
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a single resource from classpath to target path
|
||||
*/
|
||||
private void extractResource(String resourcePath, Path targetPath) throws IOException {
|
||||
InputStream is = getClass().getClassLoader().getResourceAsStream(resourcePath);
|
||||
|
||||
if (is == null) {
|
||||
log.warn("Resource not found in classpath: {}", resourcePath);
|
||||
|
||||
// Try extracting from JAR directly
|
||||
if (isRunningInJar()) {
|
||||
extractFromJar(resourcePath, targetPath);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
Files.copy(is, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
||||
is.close();
|
||||
log.debug("Extracted: {} -> {}", resourcePath, targetPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract file from JAR (for resources not found via classpath)
|
||||
*/
|
||||
private void extractFromJar(String resourcePath, Path targetPath) throws IOException {
|
||||
String jarPath = getClass().getProtectionDomain().getCodeSource().getLocation().getPath();
|
||||
|
||||
if (jarPath == null || !jarPath.endsWith(".jar")) {
|
||||
return;
|
||||
}
|
||||
|
||||
try (JarFile jarFile = new JarFile(jarPath)) {
|
||||
Enumeration<JarEntry> entries = jarFile.entries();
|
||||
|
||||
while (entries.hasMoreElements()) {
|
||||
JarEntry entry = entries.nextElement();
|
||||
String entryName = entry.getName();
|
||||
|
||||
if (entryName.equals(resourcePath) || entryName.endsWith("/" + resourcePath)) {
|
||||
try (InputStream is = jarFile.getInputStream(entry)) {
|
||||
Files.createDirectories(targetPath.getParent());
|
||||
Files.copy(is, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
||||
log.debug("Extracted from JAR: {} -> {}", entryName, targetPath);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.warn("Resource not found in JAR: {}", resourcePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if application is running from JAR
|
||||
*/
|
||||
private boolean isRunningInJar() {
|
||||
String protocol = getClass().getProtectionDomain().getCodeSource().getLocation().getProtocol();
|
||||
return "jar".equals(protocol);
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively delete a directory
|
||||
*/
|
||||
public void deleteDirectory(Path directory) throws IOException {
|
||||
if (Files.exists(directory)) {
|
||||
Files.walkFileTree(directory, new SimpleFileVisitor<Path>() {
|
||||
@Override
|
||||
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
|
||||
Files.delete(file);
|
||||
return FileVisitResult.CONTINUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
|
||||
Files.delete(dir);
|
||||
return FileVisitResult.CONTINUE;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
public String getResourceDir() {
|
||||
return resourceDir;
|
||||
}
|
||||
|
||||
public String getModelsDir() {
|
||||
return modelsDir;
|
||||
}
|
||||
|
||||
public boolean isExtractOnStartup() {
|
||||
return extractOnStartup;
|
||||
}
|
||||
|
||||
// ========== Offline Deployment Methods ==========
|
||||
|
||||
private void extractPythonRuntime() throws IOException {
|
||||
Path targetDir = Paths.get(resourceDir, "python-runtime");
|
||||
|
||||
if (Files.exists(targetDir.resolve("python/bin/python3.10")) ||
|
||||
Files.exists(targetDir.resolve("python/bin/python3"))) {
|
||||
log.info("[1/4] Python runtime already exists, skipping");
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("[1/4] Extracting Python runtime...");
|
||||
Files.createDirectories(targetDir);
|
||||
if (!extractTarGzResource("python-runtime/python-runtime.tar.gz", targetDir)) {
|
||||
copyDirectoryFromClasspath("python-runtime", targetDir);
|
||||
}
|
||||
makeExecutable(targetDir.resolve("python/bin/python3.10"));
|
||||
makeExecutable(targetDir.resolve("python/bin/python3"));
|
||||
log.info(" Done");
|
||||
}
|
||||
|
||||
private void extractVirtualEnvironment() throws IOException {
|
||||
Path targetDir = Paths.get(resourceDir, "python-runtime", "venv-offline");
|
||||
|
||||
if (Files.exists(targetDir.resolve("bin/python3.10")) ||
|
||||
Files.exists(targetDir.resolve("bin/python3"))) {
|
||||
log.info("[2/4] Python venv already exists, skipping");
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("[2/4] Extracting Python venv...");
|
||||
Files.createDirectories(targetDir.getParent());
|
||||
if (!extractTarGzResource("python-runtime/venv-offline.tar.gz", targetDir.getParent())) {
|
||||
Files.createDirectories(targetDir);
|
||||
copyDirectoryFromClasspath("python-runtime/venv-offline", targetDir);
|
||||
}
|
||||
makeExecutable(targetDir.resolve("bin/python3.10"));
|
||||
makeExecutable(targetDir.resolve("bin/python3"));
|
||||
log.info(" Done");
|
||||
}
|
||||
|
||||
private void extractOcrModels() throws IOException {
|
||||
Path targetDir = Paths.get(modelsDir);
|
||||
|
||||
if (Files.exists(targetDir.resolve("pp-ocrv5/det_model/inference.onnx"))) {
|
||||
log.info("[3/4] OCR模型已存在,跳过");
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("[3/4] 提取OCR模型...");
|
||||
Files.createDirectories(targetDir);
|
||||
copyDirectoryFromClasspath("models", targetDir);
|
||||
log.info(" ✓ 完成");
|
||||
}
|
||||
|
||||
private void copyDirectoryFromClasspath(String resourcePath, Path targetDir) throws IOException {
|
||||
log.info(" 复制资源: {}", resourcePath);
|
||||
|
||||
ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
|
||||
Resource[] resources = resolver.getResources("classpath*:" + resourcePath + "/**");
|
||||
|
||||
int fileCount = 0;
|
||||
for (Resource resource : resources) {
|
||||
if (resource.isReadable() && !resource.getURL().toString().endsWith("/")) {
|
||||
try {
|
||||
String path = resource.getURL().getPath();
|
||||
String relativePath = path.substring(path.indexOf(resourcePath));
|
||||
Path targetPath = targetDir.resolve(relativePath.substring(resourcePath.length()));
|
||||
|
||||
Files.createDirectories(targetPath.getParent());
|
||||
Files.copy(resource.getInputStream(), targetPath,
|
||||
StandardCopyOption.REPLACE_EXISTING);
|
||||
fileCount++;
|
||||
} catch (Exception e) {
|
||||
log.debug("跳过文件: {}", resource.getFilename());
|
||||
}
|
||||
}
|
||||
}
|
||||
log.info(" 复制了 {} 个文件", fileCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a .tar.gz resource from classpath into target directory.
|
||||
* Returns true if extracted, false if resource not found.
|
||||
*/
|
||||
private boolean extractTarGzResource(String resourceName, Path targetDir) throws IOException {
|
||||
ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
|
||||
Resource resource = resolver.getResource("classpath:" + resourceName);
|
||||
if (!resource.exists()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
log.info(" Extracting archive: {}", resourceName);
|
||||
Files.createDirectories(targetDir);
|
||||
|
||||
// Read to a temp file first to avoid nested-jar stream issues
|
||||
Path tmp = Files.createTempFile("ocr-archive-", ".tar.gz");
|
||||
try (InputStream is = resource.getInputStream()) {
|
||||
Files.copy(is, tmp, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
long tmpSize = Files.size(tmp);
|
||||
if (tmpSize <= 0) {
|
||||
log.warn(" Archive size is 0 bytes: {}", resourceName);
|
||||
try {
|
||||
Files.deleteIfExists(tmp);
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
log.info(" Archive temp file: {} ({} bytes)", tmp.toAbsolutePath(), tmpSize);
|
||||
|
||||
// Prefer system tar on Linux for reliability
|
||||
if (isLinux()) {
|
||||
try {
|
||||
ProcessBuilder pb = new ProcessBuilder(
|
||||
"tar", "-xzf", tmp.toAbsolutePath().toString(), "-C", targetDir.toAbsolutePath().toString());
|
||||
pb.redirectErrorStream(true);
|
||||
Process p = pb.start();
|
||||
StringBuilder out = new StringBuilder();
|
||||
try (java.io.BufferedReader br = new java.io.BufferedReader(
|
||||
new java.io.InputStreamReader(p.getInputStream()))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
out.append(line).append('\n');
|
||||
}
|
||||
}
|
||||
int code = p.waitFor();
|
||||
if (code == 0) {
|
||||
log.info(" System tar extraction succeeded: {}", resourceName);
|
||||
try {
|
||||
Files.deleteIfExists(tmp);
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
log.warn(" System tar extraction failed (code {}): {}", code, resourceName);
|
||||
if (out.length() > 0) {
|
||||
log.warn(" System tar output: {}", out.toString().trim());
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn(" System tar extraction error: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
int extracted = 0;
|
||||
try (InputStream fis = Files.newInputStream(tmp);
|
||||
GZIPInputStream gis = new GZIPInputStream(fis);
|
||||
TarArchiveInputStream tis = new TarArchiveInputStream(gis)) {
|
||||
|
||||
TarArchiveEntry entry;
|
||||
while ((entry = tis.getNextTarEntry()) != null) {
|
||||
String entryName = entry.getName();
|
||||
if (entryName == null || entryName.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
Path outPath = targetDir.resolve(entryName).normalize();
|
||||
if (!outPath.startsWith(targetDir)) {
|
||||
continue;
|
||||
}
|
||||
if (entry.isDirectory()) {
|
||||
Files.createDirectories(outPath);
|
||||
continue;
|
||||
}
|
||||
if (entry.isSymbolicLink() || entry.isLink()) {
|
||||
// Skip symlinks to keep Windows compatibility
|
||||
continue;
|
||||
}
|
||||
Files.createDirectories(outPath.getParent());
|
||||
Files.copy(tis, outPath, StandardCopyOption.REPLACE_EXISTING);
|
||||
extracted++;
|
||||
}
|
||||
}
|
||||
if (extracted == 0) {
|
||||
log.warn(" Archive extracted 0 files: {}", resourceName);
|
||||
} else {
|
||||
log.info(" Extracted {} files from {}", extracted, resourceName);
|
||||
}
|
||||
try {
|
||||
Files.deleteIfExists(tmp);
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean isLinux() {
|
||||
String os = System.getProperty("os.name").toLowerCase();
|
||||
return os.contains("linux");
|
||||
}
|
||||
|
||||
private void makeExecutable(Path file) {
|
||||
if (Files.exists(file)) {
|
||||
try {
|
||||
file.toFile().setExecutable(true);
|
||||
// Also try chmod on Linux/Unix
|
||||
if (System.getProperty("os.name").toLowerCase().contains("linux") ||
|
||||
System.getProperty("os.name").toLowerCase().contains("unix") ||
|
||||
System.getProperty("os.name").toLowerCase().contains("mac")) {
|
||||
Runtime.getRuntime().exec(new String[]{"chmod", "+x", file.toString()});
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("无法设置可执行权限: {}", file);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,424 @@
|
|||
package com.chinaweal.youfool.reportdetect.modules.ocr.engine;
|
||||
|
||||
import com.chinaweal.youfool.reportdetect.common.utils.ResourceExtractor;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.ApplicationListener;
|
||||
import org.springframework.context.event.ContextRefreshedEvent;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.PreDestroy;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* Manages Flask OCR API process lifecycle
|
||||
* - Auto-starts Flask on application startup
|
||||
* - Monitors Flask health
|
||||
* - Gracefully shuts down on application exit
|
||||
*/
|
||||
@Component
|
||||
public class FlaskProcessManager implements ApplicationListener<ContextRefreshedEvent> {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FlaskProcessManager.class);
|
||||
|
||||
@Value("${app.ocr.flask.enabled:true}")
|
||||
private boolean flaskEnabled;
|
||||
|
||||
@Value("${app.ocr.flask.host:127.0.0.1}")
|
||||
private String flaskHost;
|
||||
|
||||
@Value("${app.ocr.flask.port:8081}")
|
||||
private int flaskPort;
|
||||
|
||||
@Value("${app.ocr.flask.startup-timeout:60}")
|
||||
private int startupTimeoutSeconds;
|
||||
|
||||
@Value("${app.ocr.flask.disable-model-source-check:true}")
|
||||
private boolean disableModelSourceCheck;
|
||||
|
||||
@Value("${app.ocr.python-command:python}")
|
||||
private String pythonCommand;
|
||||
|
||||
@Value("${app.ocr.resource-dir:./ocr-resources}")
|
||||
private String resourceDir;
|
||||
|
||||
@Value("${app.ocr.models-dir:./models}")
|
||||
private String modelsDir;
|
||||
|
||||
@Autowired
|
||||
private ResourceExtractor resourceExtractor;
|
||||
|
||||
private Process flaskProcess;
|
||||
private boolean flaskReady = false;
|
||||
|
||||
@Override
|
||||
public void onApplicationEvent(ContextRefreshedEvent event) {
|
||||
if (flaskEnabled) {
|
||||
startFlaskProcess();
|
||||
} else {
|
||||
log.info("Flask process management is disabled");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start Flask API server with embedded Python (offline mode)
|
||||
*/
|
||||
public synchronized void startFlaskProcess() {
|
||||
if (!flaskEnabled) {
|
||||
log.info("Flask process management is disabled");
|
||||
return;
|
||||
}
|
||||
if (flaskProcess != null && flaskProcess.isAlive()) {
|
||||
log.info("Flask OCR API server is already running");
|
||||
return;
|
||||
}
|
||||
log.info("Starting Flask OCR API server...");
|
||||
|
||||
try {
|
||||
// Extract all resources for offline mode
|
||||
resourceExtractor.extractAllResources();
|
||||
|
||||
// Determine Python executable and Flask API script path
|
||||
String pythonExecutable;
|
||||
Path apiScriptPath;
|
||||
|
||||
// Require embedded Python (Linux + Windows layouts)
|
||||
Path embeddedPython = resolveEmbeddedPython();
|
||||
if (embeddedPython != null) {
|
||||
pythonExecutable = embeddedPython.toString();
|
||||
log.info("Using embedded Python: {}", pythonExecutable);
|
||||
} else {
|
||||
log.error("Embedded Python not found. Refusing to start Flask with system Python.");
|
||||
log.error("Expected embedded runtime under: {}/python-runtime/python", resourceDir);
|
||||
return;
|
||||
}
|
||||
|
||||
// Try multiple locations for Flask API script
|
||||
// Priority 1: Project's python_api directory (development)
|
||||
apiScriptPath = Paths.get("./python_api/ocr_api_server.py");
|
||||
if (Files.exists(apiScriptPath)) {
|
||||
log.info("Found Flask API script at: {}", apiScriptPath);
|
||||
} else {
|
||||
// Priority 2: Embedded in ocr-resources (offline deployment)
|
||||
apiScriptPath = Paths.get(resourceDir, "python-api/ocr_api_server.py");
|
||||
if (Files.exists(apiScriptPath)) {
|
||||
log.info("Found Flask API script at: {}", apiScriptPath);
|
||||
} else {
|
||||
// Priority 3: Root of ocr-resources
|
||||
apiScriptPath = Paths.get(resourceDir, "ocr_api_server.py");
|
||||
if (Files.exists(apiScriptPath)) {
|
||||
log.info("Found Flask API script at: {}", apiScriptPath);
|
||||
} else {
|
||||
log.error("Flask API script not found at any of the following locations:");
|
||||
log.error(" 1. ./python_api/ocr_api_server.py");
|
||||
log.error(" 2. {}/python-api/ocr_api_server.py", resourceDir);
|
||||
log.error(" 3. {}/ocr_api_server.py", resourceDir);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add(pythonExecutable);
|
||||
command.add(apiScriptPath.toAbsolutePath().toString());
|
||||
|
||||
// Configure ProcessBuilder
|
||||
ProcessBuilder pb = new ProcessBuilder(command);
|
||||
// Set working directory to project root (where python_api is located)
|
||||
pb.directory(new File("."));
|
||||
|
||||
// Set environment variables for offline mode
|
||||
Map<String, String> env = pb.environment();
|
||||
env.put("PYTHONIOENCODING", "utf-8");
|
||||
env.put("PYTHONUNBUFFERED", "1");
|
||||
// Note: Don't set PYTHONNOUSERSITE=1 as it blocks loading system-installed Flask
|
||||
|
||||
// Use bundled libraries if available (Linux/Windows)
|
||||
Path libPath = resolveEmbeddedSitePackages();
|
||||
if (libPath != null) {
|
||||
env.put("PYTHONPATH", libPath.toString());
|
||||
}
|
||||
// Prefer embedded Python home when using standalone runtime
|
||||
Path pythonHome = resolveEmbeddedPythonHome();
|
||||
if (pythonHome != null) {
|
||||
env.put("PYTHONHOME", pythonHome.toString());
|
||||
}
|
||||
|
||||
// Use bundled models
|
||||
Path modelsPath = Paths.get(modelsDir).toAbsolutePath();
|
||||
env.put("PADDLEOCR_HOME", modelsPath.toString());
|
||||
env.put("HUB_HOME", modelsPath.toString());
|
||||
if (disableModelSourceCheck) {
|
||||
env.put("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True");
|
||||
}
|
||||
|
||||
// Flask configuration
|
||||
env.put("PORT", String.valueOf(flaskPort));
|
||||
env.put("HOST", flaskHost);
|
||||
|
||||
pb.redirectErrorStream(true);
|
||||
|
||||
log.info("Starting Flask with embedded Python: {}", pythonExecutable);
|
||||
log.info("Flask will listen on: http://{}:{}", flaskHost, flaskPort);
|
||||
|
||||
// Start process
|
||||
flaskProcess = pb.start();
|
||||
startFlaskLogReader();
|
||||
waitForFlaskReady();
|
||||
|
||||
if (flaskReady) {
|
||||
log.info("✓ Flask OCR API server started successfully (offline mode)");
|
||||
} else {
|
||||
log.warn("Flask OCR API server not ready yet; keeping process running for late readiness");
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("Failed to start Flask process (offline mode)", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start thread to read Flask logs
|
||||
*/
|
||||
private void startFlaskLogReader() {
|
||||
Thread logReader = new Thread(() -> {
|
||||
try (BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(flaskProcess.getInputStream()))) {
|
||||
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
log.info("[Flask] {}", line);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
if (flaskProcess.isAlive()) {
|
||||
log.warn("Error reading Flask logs", e);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
logReader.setDaemon(true);
|
||||
logReader.setName("Flask-Log-Reader");
|
||||
logReader.start();
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for Flask to be ready by polling health endpoint
|
||||
*/
|
||||
private void waitForFlaskReady() {
|
||||
log.info("Waiting for Flask to be ready (timeout: {}s)", startupTimeoutSeconds);
|
||||
|
||||
String healthUrl = String.format("http://%s:%d/health", flaskHost, flaskPort);
|
||||
long startTime = System.currentTimeMillis();
|
||||
long timeoutMillis = TimeUnit.SECONDS.toMillis(startupTimeoutSeconds);
|
||||
|
||||
while (System.currentTimeMillis() - startTime < timeoutMillis) {
|
||||
if (!flaskProcess.isAlive()) {
|
||||
log.error("Flask process terminated unexpectedly");
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
HttpURLConnection conn = (HttpURLConnection) new URL(healthUrl).openConnection();
|
||||
conn.setRequestMethod("GET");
|
||||
conn.setConnectTimeout(2000);
|
||||
conn.setReadTimeout(2000);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
if (responseCode == 200) {
|
||||
log.info("Flask health check passed");
|
||||
flaskReady = true;
|
||||
return;
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
// Not ready yet, continue waiting
|
||||
log.debug("Flask not ready yet: {}", e.getMessage());
|
||||
}
|
||||
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException ie) {
|
||||
Thread.currentThread().interrupt();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
log.warn("Flask health check timeout after {}s", startupTimeoutSeconds);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop Flask process gracefully
|
||||
*/
|
||||
@PreDestroy
|
||||
public void stopFlaskProcess() {
|
||||
if (flaskProcess != null && flaskProcess.isAlive()) {
|
||||
log.info("Stopping Flask OCR API server...");
|
||||
|
||||
// Try graceful shutdown first
|
||||
flaskProcess.destroy();
|
||||
|
||||
try {
|
||||
// Wait up to 10 seconds for process to terminate
|
||||
if (!flaskProcess.waitFor(10, TimeUnit.SECONDS)) {
|
||||
log.warn("Flask did not stop gracefully, forcing termination");
|
||||
flaskProcess.destroyForcibly();
|
||||
}
|
||||
|
||||
log.info("Flask OCR API server stopped");
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
log.error("Interrupted while waiting for Flask to stop", e);
|
||||
}
|
||||
|
||||
flaskReady = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if Flask is ready
|
||||
*/
|
||||
public boolean isFlaskReady() {
|
||||
return flaskReady && flaskProcess != null && flaskProcess.isAlive();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Flask base URL
|
||||
*/
|
||||
public String getFlaskBaseUrl() {
|
||||
return String.format("http://%s:%d", flaskHost, flaskPort);
|
||||
}
|
||||
|
||||
/**
|
||||
* Restart Flask process
|
||||
*/
|
||||
public void restartFlask() {
|
||||
log.info("Restarting Flask OCR API server...");
|
||||
stopFlaskProcess();
|
||||
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
|
||||
startFlaskProcess();
|
||||
}
|
||||
|
||||
public synchronized boolean ensureFlaskRunning() {
|
||||
if (!flaskEnabled) {
|
||||
log.info("Flask process management is disabled");
|
||||
return false;
|
||||
}
|
||||
if (flaskProcess != null && flaskProcess.isAlive() && flaskReady) {
|
||||
return true;
|
||||
}
|
||||
if (flaskProcess != null && flaskProcess.isAlive() && !flaskReady) {
|
||||
waitForFlaskReady();
|
||||
return flaskReady;
|
||||
}
|
||||
startFlaskProcess();
|
||||
return flaskReady;
|
||||
}
|
||||
|
||||
private Path resolveEmbeddedPython() {
|
||||
// Prefer embedded standalone runtime first
|
||||
Path linuxPython310 = Paths.get(resourceDir, "python-runtime/python/bin/python3.10");
|
||||
if (Files.exists(linuxPython310)) {
|
||||
return linuxPython310;
|
||||
}
|
||||
Path linuxPython3 = Paths.get(resourceDir, "python-runtime/python/bin/python3");
|
||||
if (Files.exists(linuxPython3)) {
|
||||
return linuxPython3;
|
||||
}
|
||||
Path linuxPython = Paths.get(resourceDir, "python-runtime/python/bin/python");
|
||||
if (Files.exists(linuxPython)) {
|
||||
return linuxPython;
|
||||
}
|
||||
// Fallback to venv interpreter if bundled (still embedded)
|
||||
linuxPython310 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.10");
|
||||
if (Files.exists(linuxPython310)) {
|
||||
return linuxPython310;
|
||||
}
|
||||
Path linuxPython311 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.11");
|
||||
if (Files.exists(linuxPython311)) {
|
||||
return linuxPython311;
|
||||
}
|
||||
Path linuxPython39 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.9");
|
||||
if (Files.exists(linuxPython39)) {
|
||||
return linuxPython39;
|
||||
}
|
||||
Path linuxPython38 = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3.8");
|
||||
if (Files.exists(linuxPython38)) {
|
||||
return linuxPython38;
|
||||
}
|
||||
Path linuxPythonVenv = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python3");
|
||||
if (Files.exists(linuxPythonVenv)) {
|
||||
return linuxPythonVenv;
|
||||
}
|
||||
Path linuxPythonAlt = Paths.get(resourceDir, "python-runtime/venv-offline/bin/python");
|
||||
if (Files.exists(linuxPythonAlt)) {
|
||||
return linuxPythonAlt;
|
||||
}
|
||||
Path windowsPython = Paths.get(resourceDir, "python-runtime/venv-offline/Scripts/python.exe");
|
||||
if (Files.exists(windowsPython)) {
|
||||
return windowsPython;
|
||||
}
|
||||
Path windowsPythonAlt = Paths.get(resourceDir, "python-runtime/python.exe");
|
||||
if (Files.exists(windowsPythonAlt)) {
|
||||
return windowsPythonAlt;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private Path resolveEmbeddedPythonHome() {
|
||||
Path pythonHome = Paths.get(resourceDir, "python-runtime/python");
|
||||
if (Files.isDirectory(pythonHome)) {
|
||||
return pythonHome;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private Path resolveEmbeddedSitePackages() {
|
||||
// Linux venvs use lib/pythonX.Y/site-packages
|
||||
Path libRoot = Paths.get(resourceDir, "python-runtime/venv-offline/lib");
|
||||
if (Files.isDirectory(libRoot)) {
|
||||
try {
|
||||
try (java.nio.file.DirectoryStream<Path> stream = Files.newDirectoryStream(libRoot)) {
|
||||
for (Path p : stream) {
|
||||
if (Files.isDirectory(p) && p.getFileName().toString().startsWith("python")) {
|
||||
Path site = p.resolve("site-packages");
|
||||
if (Files.exists(site)) {
|
||||
return site;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.warn("Failed to scan embedded site-packages: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
// Windows venvs
|
||||
Path venvLibWin = Paths.get(resourceDir, "python-runtime/venv-offline/Lib/site-packages");
|
||||
if (Files.exists(venvLibWin)) {
|
||||
return venvLibWin;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue