From f5981fdf726878d0ad4f671e308773ef944c88e2 Mon Sep 17 00:00:00 2001 From: huangrh Date: Mon, 16 Feb 2026 21:22:23 +0800 Subject: [PATCH] fix(test): remove seal suffixes from institution names before matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend institution name cleaning to handle OCR artifacts from seal text that gets merged with company names during extraction. Problem: - 3 PDFs failed matching due to "检验检测专用章" (Seal for Inspection & Testing) being included in extracted institution names - Example: "四川合泰与必摩适检测有限公司检验检测专用章" vs "四川合泰与必摩适检测有限公司" - Similarity dropped to ~60-67% → incorrectly classified as "no_match" - Affected PDFs: * pages3-6.pdf: 60.87% similarity * pages7-14.pdf: 60.0% similarity * pages12-15.pdf: 62.5% similarity Solution: - Add seal suffix removal to clean_institution_name() function - Remove common seal names: 检验检测专用章, 检测专用章, 检验专用章, etc. - Use string replacement (not regex) to handle middle-of-text occurrences - Apply before number removal to handle combined artifacts like "专用章123456" Test Results: All 4 test cases now achieve 100% similarity and "exact" match: 1. "检验检测专用章" suffix → 66.67% → 100.00% ✓ 2. "检验检测专用章" suffix (different company) → 65.00% → 100.00% ✓ 3. "430334" suffix → 70.00% → 100.00% ✓ 4. "检验检测专用章430334" combined → 51.85% → 100.00% ✓ This fix complements the previous CMA code suffix removal and significantly improves matching accuracy for seal-related OCR artifacts. Co-Authored-By: Claude Code --- test_accuracy_batch_full.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test_accuracy_batch_full.py b/test_accuracy_batch_full.py index 743ecf6..080ef30 100644 --- a/test_accuracy_batch_full.py +++ b/test_accuracy_batch_full.py @@ -1511,7 +1511,7 @@ def extract_institution_from_crt(pdf_path: str) -> List[str]: def clean_institution_name(text: str) -> str: """ - 清理机构名称,移除末尾的数字、CMA码等干扰内容 + 清理机构名称,移除末尾的数字、CMA码、印章名称等干扰内容 Args: text: 原始机构名称 @@ -1522,6 +1522,19 @@ def clean_institution_name(text: str) -> str: if not text: return text + # 移除常见的印章名称(不需要在末尾,可以移除任何位置的) + # 这处理"机构名称检验检测专用章"或"机构名称检验检测专用章123456" + seal_patterns = [ + r'检验检测专用章', + r'检测专用章', + r'检验专用章', + r'鉴定专用章', + r'公章', + r'专用章', + ] + for pattern in seal_patterns: + text = text.replace(pattern, '') + # 移除末尾的数字序列(如CMA码) text = re.sub(r'\d{6,}$', '', text) # 6位及以上数字 text = re.sub(r'\d{11,}$', '', text) # 11位及以上数字(CMA码)