app/infrastructure/service/wechat/chat_snapshot_analyzer.py

from dataclasses import dataclass, asdict
from io import BytesIO
from pathlib import Path
import json
import re
import sys

import cv2
import numpy as np
from PIL import Image

# Project root directory: parents[4] of this file's resolved path.
ROOT = Path(__file__).resolve().parents[4]
# Put the root first on sys.path ahead of the `app.*` imports below
# (presumably so they resolve when this file is run directly — confirm).
sys.path.insert(0, str(ROOT))

from app.infrastructure.service.logging.log_service import log_event, new_trace_id
from app.infrastructure.service.wechat.config import (
    OCR_SAVE_DIR,
    OCR_TOP_PENALTY_RATIO,
    OCR_TOP_PENALTY_BIN_FACTOR,
    OCR_TOP_PENALTY_COLOR_FACTOR,
)
from app.infrastructure.service.wechat.ocr import OCRService

# Directory holding the chat screenshots taken after a session was clicked
CLICKED_DIR = ROOT / OCR_SAVE_DIR / 'sessions' / 'clicked'
# OCR output directory; contains the sub-directories original, merged_binary and ocr_crops
OUT_DIR = ROOT / OCR_SAVE_DIR / 'sessions' / 'clicked_ocr'
# Where original screenshots are saved
ORIGINAL_DIR = OUT_DIR / 'original'
# Where merged binary images are saved
MERGED_DIR = OUT_DIR / 'merged_binary'
# Where OCR crop regions are saved
OCR_DIR = OUT_DIR / 'ocr_crops'
# Create every output directory at import time (no error if it already exists)
for _dir in (OUT_DIR, ORIGINAL_DIR, MERGED_DIR, OCR_DIR):
    _dir.mkdir(parents=True, exist_ok=True)


# Resolve (and create) the per-image output directory for a given category.
def get_image_out_dir(stem: str, category: str = 'ocr') -> Path:
    """Return the output directory for image *stem*, creating it if needed.

    ``category`` selects the parent directory: ``'original'`` maps to
    ORIGINAL_DIR, ``'merged'`` to MERGED_DIR, and anything else to OCR_DIR.
    An empty *stem* falls back to the folder name ``'unknown'``.
    """
    parents_by_category = {
        'original': ORIGINAL_DIR,
        'merged': MERGED_DIR,
    }
    folder_name = stem if stem else 'unknown'
    target = parents_by_category.get(category, OCR_DIR) / folder_name
    target.mkdir(parents=True, exist_ok=True)
    return target

# Module-level OCR service instance shared by the helpers below
ocr = OCRService()

# Binarisation and adaptive-threshold parameters
BINARY_THRESHOLD = 248  # fixed binarisation threshold
ADAPTIVE_BLOCK_SIZE = 13  # adaptive-threshold block size
ADAPTIVE_C = 1  # adaptive-threshold offset (C)
LEFT_MERGE_KERNEL = (5, 1)  # morphology kernel used for left-side bubbles
DEFAULT_MERGE_KERNEL = (3, 1)  # default morphology kernel
OPEN_KERNEL = (2, 2)  # kernel size of the opening operation

# Bubble colour and position heuristics
PEER_BUBBLE_RGB = (238, 238, 240)  # colour of the peer's message bubble (#EEEEF0)
LEFT_BUBBLE_CX_RATIO = 0.58  # centre-x ratio threshold for a left-side bubble
RIGHT_BUBBLE_MIN_LEFT_RATIO = 0.48  # minimum left-edge ratio of a right-side bubble
LEFT_SIDE_MAX_RIGHT_RATIO = 0.78  # maximum right-edge ratio of a left-side bubble
LEFT_COLOR_THRESHOLD = 64  # colour-distance threshold for left-side bubbles
RIGHT_COLOR_THRESHOLD = 108  # colour-distance threshold for right-side bubbles
BRIGHTNESS_CUTOFF = 248  # brightness cut-off
BG_WHITE_THRESHOLD = 245  # per-channel threshold for white background
BRIGHT_MASK_THRESHOLD = 242  # threshold of the highlight mask

# Area and size thresholds for connected-component analysis
MIN_AREA_BINARY = 700  # minimum component area, binary image
MIN_WIDTH_BINARY = 42  # minimum width, binary image
MIN_HEIGHT_BINARY = 24  # minimum height, binary image
MIN_AREA_COLOR = 560  # minimum component area, colour image
MIN_WIDTH_COLOR = 38  # minimum width, colour image
MIN_HEIGHT_COLOR = 22  # minimum height, colour image

# Scoring parameters for the full-binary fallback box
FULL_BIN_SCORE_BOTTOM = 5.0  # weight of the bottom position
FULL_BIN_SCORE_AREA = 0.08  # weight of the area
FULL_BIN_SCORE_WIDTH = 0.15  # weight of the width
FULL_BIN_PENALTY_TALL = 8.0  # penalty factor for being too tall
FULL_BIN_PENALTY_WIDE = 3.0  # penalty factor for being too wide

# Filter rules for latest-message candidate boxes (used by pick_last_white_box_from_merged_binary)
# Avatar filter: width/height ratio range, size range in pixels, and edge bands as fractions of image width
AVATAR_RATIO_MIN = 0.75
AVATAR_RATIO_MAX = 1.35
AVATAR_MIN_SIZE = 28
AVATAR_MAX_SIZE = 72
AVATAR_EDGE_LEFT_RATIO = 0.2
AVATAR_EDGE_RIGHT_RATIO = 0.8
# Time-marker filter: height/width ranges in pixels, width/height ratio range,
# and allowed distance from the horizontal centre as a fraction of image width
TIME_MARKER_MIN_H = 10
TIME_MARKER_MAX_H = 22
TIME_MARKER_MIN_W = 28
TIME_MARKER_MAX_W = 110
TIME_MARKER_RATIO_MIN = 1.8
TIME_MARKER_RATIO_MAX = 8.0
TIME_MARKER_CENTER_TOLERANCE = 0.16

# Scoring parameters for binary bubble boxes
BIN_SCORE_BOTTOM = 5.0  # weight of the bottom position
BIN_SCORE_AREA = 0.12  # weight of the area
BIN_SCORE_OVERLAP = 220.0  # weight of the horizontal overlap ratio
BIN_PENALTY_TOP_GAP_LEFT = 10.0  # top-gap penalty, left-side bubbles
BIN_PENALTY_TOP_GAP_RIGHT = 4.0  # top-gap penalty, right-side bubbles
BIN_PENALTY_BOTTOM_GAP = 1.8  # bottom-gap penalty
BIN_PENALTY_TOO_TALL = 9.0  # penalty for being too tall
BIN_PENALTY_TOO_WIDE = 2.0  # penalty for being too wide

# Scoring parameters for colour bubble boxes
COLOR_SCORE_AREA = 0.35  # weight of the area
COLOR_SCORE_BOTTOM = 3.0  # weight of the bottom position
COLOR_SCORE_OVERLAP = 180.0  # weight of the horizontal overlap ratio
COLOR_PENALTY_TOP_GAP_LEFT = 7.5  # top-gap penalty, left-side bubbles
COLOR_PENALTY_BOTTOM_GAP = 1.6  # bottom-gap penalty
COLOR_PENALTY_TOO_TALL = 7.0  # penalty for being too tall
COLOR_PENALTY_TOO_WIDE = 1.6  # penalty for being too wide
COLOR_SCORE_TOP_GAP_RIGHT = 1.0  # top-gap bonus, right-side bubbles
COLOR_SCORE_HEIGHT_RIGHT = 2.0  # height bonus, right-side bubbles

# Scoring parameters for the latest-message cluster
CLUSTER_SCORE_BOTTOM = 2.2  # weight of the cluster's bottom position
CLUSTER_SCORE_HEIGHT = 0.9  # weight of the cluster height
CLUSTER_SCORE_WIDTH = 0.12  # weight of the cluster width
CLUSTER_SCORE_TEXT_LEN = 1.5  # weight of the text length
CLUSTER_SCORE_MULTI_LINE_BONUS = 35.0  # extra bonus for multi-line messages

# Non-body text filter: texts matched exactly
NON_BODY_EXACT_TEXTS = {
    '小程序',
    ':S',
}
# Non-body text filter: texts containing any of these keywords
NON_BODY_CONTAINS_TEXTS = (
    '有事请@其他福利官',
    '我是机器人',
    '群主小助手',
)


# Data class holding the result of analysing one chat snapshot
@dataclass
class AnalyzeResult:
    """Result record for the analysis of a single chat snapshot.

    NOTE(review): the code that fills this record is not visible in this
    part of the file; the per-field notes below are inferred from the field
    names and types and should be confirmed against the producer.
    """
    file: str  # presumably the analysed screenshot's file name or path
    size: tuple[int, int] | None  # presumably the image size — TODO confirm (width, height) order
    crop_box: dict | None  # presumably a left/top/right/bottom box dict like the ones built in this module
    latest_text: str | None  # presumably the text of the latest message
    is_self_sent: bool | None  # presumably True when the latest message was sent by the user
    bubble_side: str | None  # presumably which side the bubble is on — confirm the allowed values
    confidence: float  # presumably a confidence score for the result — confirm the range
    valid_lines: list[str]  # presumably the OCR lines kept after filtering
    error: str | None  # presumably an error description, None on success


# Encode a PIL image as PNG and hand back the raw bytes.
def pil_to_bytes(img: Image.Image) -> bytes:
    """Serialise *img* to PNG in memory and return the encoded bytes."""
    with BytesIO() as stream:
        img.save(stream, format='PNG')
        return stream.getvalue()


# Run OCR on an image and return the recognised, non-blank text lines.
def ocr_lines(img: Image.Image, scene: str) -> list[str]:
    """Recognise *img* via the shared OCR service and return stripped lines.

    The image is PNG-encoded first; *scene* is forwarded to
    ``ocr.recognize``. Falsy or whitespace-only entries are dropped.
    """
    recognised = ocr.recognize(pil_to_bytes(img), scene=scene)
    lines = []
    for entry in recognised:
        if entry and str(entry).strip():
            lines.append(entry.strip())
    return lines


def ocr_items_direct(img: Image.Image) -> list[dict]:
    """Call the ``rapid_provider`` engine directly and return text items.

    Returns an empty list when the provider is missing, not ready, has no
    engine, or yields no result. Each returned dict carries ``text``,
    ``confidence`` and the axis-aligned ``left``/``top``/``right``/``bottom``
    extents of the detected box.
    """
    provider = getattr(ocr, 'rapid_provider', None)
    if provider is None:
        return []
    if not provider.ensure_ready() or provider.engine is None:
        return []

    # The engine is fed a BGR array converted from the RGB image.
    bgr = cv2.cvtColor(np.array(img.convert('RGB')), cv2.COLOR_RGB2BGR)
    output = provider.engine(bgr)
    if not output or len(output) < 1:
        return []

    parsed = []
    for entry in output[0] or []:
        if not entry or len(entry) < 2:
            continue
        quad = np.array(entry[0], dtype=np.float32)
        text = str(entry[1]).strip()
        # A missing third element is treated as confidence 0.0.
        confidence = float(entry[2]) if len(entry) > 2 else 0.0
        if not text:
            continue
        xs = quad[:, 0]
        ys = quad[:, 1]
        parsed.append({
            'text': text,
            'confidence': confidence,
            'left': float(xs.min()),
            'top': float(ys.min()),
            'right': float(xs.max()),
            'bottom': float(ys.max()),
        })
    return parsed


# Build the merged binary image: fixed threshold OR adaptive threshold, then light morphology.
def build_merged_binary_image(img: Image.Image, *, is_left_bubble: bool | None = None) -> Image.Image:
    """Return a binary image combining fixed and adaptive inverse thresholds.

    The grayscale image is blurred, thresholded both ways (inverted), and the
    two masks are OR-ed. A close followed by an open is then applied. Only
    the closing kernel depends on *is_left_bubble*: LEFT_MERGE_KERNEL when it
    is exactly ``True``, DEFAULT_MERGE_KERNEL otherwise.
    """
    rgb = np.array(img.convert('RGB'))
    smoothed = cv2.GaussianBlur(cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY), (5, 5), 0)

    fixed_mask = cv2.threshold(smoothed, BINARY_THRESHOLD, 255, cv2.THRESH_BINARY_INV)[1]
    adaptive_mask = cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, ADAPTIVE_BLOCK_SIZE, ADAPTIVE_C
    )
    combined = cv2.bitwise_or(fixed_mask, adaptive_mask)

    # Near-minimal merging: keep only thin horizontal links so large white
    # blobs are broken apart as much as possible.
    close_size = LEFT_MERGE_KERNEL if is_left_bubble is True else DEFAULT_MERGE_KERNEL
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, close_size)
    open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, OPEN_KERNEL)
    combined = cv2.morphologyEx(combined, cv2.MORPH_CLOSE, close_kernel, iterations=1)
    combined = cv2.morphologyEx(combined, cv2.MORPH_OPEN, open_kernel)
    return Image.fromarray(combined)


# Pick the best-scoring text region box from the full merged binary image.
def pick_box_from_full_binary(img: Image.Image) -> dict | None:
    """Score connected components of the merged binary image and return the best.

    Components with area < 300, width < 50 or height < 24 are ignored. The
    score rewards a low bottom edge, area (capped at 20000) and width (capped
    at 70% of the image width), and penalises components taller than 45% or
    wider than 65% of the image. Returns a left/top/right/bottom/width/height
    dict, or ``None`` when no component qualifies.
    """
    mask = np.array(build_merged_binary_image(img))
    label_count, _, stats, _ = cv2.connectedComponentsWithStats(mask, 8)
    img_h, img_w = mask.shape[:2]
    tall_limit = img_h * 0.45
    wide_limit = img_w * 0.65
    width_cap = int(img_w * 0.7)

    winner = None
    # Label 0 is the background, so start from 1.
    for idx in range(1, label_count):
        x, y, bw, bh, area = stats[idx]
        if area < 300 or bw < 50 or bh < 24:
            continue
        right, bottom = x + bw, y + bh

        score = 0.0
        score += bottom * FULL_BIN_SCORE_BOTTOM
        score += min(area, 20000) * FULL_BIN_SCORE_AREA
        score += min(bw, width_cap) * FULL_BIN_SCORE_WIDTH
        if bh > tall_limit:
            score -= (bh - tall_limit) * FULL_BIN_PENALTY_TALL
        if bw > wide_limit:
            score -= (bw - wide_limit) * FULL_BIN_PENALTY_WIDE

        # Strict ">" keeps the earliest component on ties.
        if winner is None or score > winner[0]:
            winner = (score, x, y, right, bottom)

    if winner is None:
        return None
    left, top, right, bottom = winner[1:]
    return {
        'left': int(left),
        'top': int(top),
        'right': int(right),
        'bottom': int(bottom),
        'width': int(right - left),
        'height': int(bottom - top),
    }


# Pick the lowest (latest) white text box on the left side of a merged binary image.
def pick_last_white_box_from_merged_binary(merged_img: Image.Image) -> dict | None:
    """Return the best-scoring left-side component of *merged_img*.

    Components are discarded when they span the full width, are smaller than
    32x12, look like an edge avatar, look like a centred time marker, or are
    not on the left side. Survivors are scored mainly by how low they sit;
    on equal scores the later component wins. Returns a box dict or ``None``.
    """
    mask = np.array(merged_img.convert('L'))
    img_w = mask.shape[1]
    label_count, _, stats, _ = cv2.connectedComponentsWithStats(mask, 8)

    winner = None  # (score, left, top, right, bottom)
    for idx in range(1, label_count):
        x, y, w, h, area = stats[idx]
        if area <= 0 or w <= 0 or h <= 0:
            continue
        right = x + w
        bottom = y + h
        aspect = w / max(1.0, float(h))
        center_x = (x + right) / 2.0

        spans_full_width = x <= img_w * 0.02 and right >= img_w * 0.98 and w >= img_w * 0.9
        if spans_full_width or w < 32 or h < 12:
            continue

        # Rule 1: drop edge avatars (roughly square, fixed size range, near the left/right edge).
        avatar_shaped = (
            AVATAR_RATIO_MIN <= aspect <= AVATAR_RATIO_MAX
            and AVATAR_MIN_SIZE <= w <= AVATAR_MAX_SIZE
            and AVATAR_MIN_SIZE <= h <= AVATAR_MAX_SIZE
        )
        near_edge = x <= img_w * AVATAR_EDGE_LEFT_RATIO or right >= img_w * AVATAR_EDGE_RIGHT_RATIO
        if avatar_shaped and near_edge:
            continue

        # Rule 2: drop centred time markers (short, elongated, near the vertical centre line).
        time_marker_shaped = (
            TIME_MARKER_MIN_H <= h <= TIME_MARKER_MAX_H
            and TIME_MARKER_MIN_W <= w <= TIME_MARKER_MAX_W
            and TIME_MARKER_RATIO_MIN <= aspect <= TIME_MARKER_RATIO_MAX
            and abs(center_x - img_w / 2.0) <= img_w * TIME_MARKER_CENTER_TOLERANCE
        )
        if time_marker_shaped:
            continue

        # Only left-side boxes are considered.
        on_left = center_x <= img_w * LEFT_BUBBLE_CX_RATIO and x <= img_w * RIGHT_BUBBLE_MIN_LEFT_RATIO
        if not on_left:
            continue

        score = 0.0
        score += bottom * 1.0
        score += right * 0.08
        score += min(area, 28000) * 0.0012
        if right < img_w * 0.35:
            score -= 80.0
        if right > img_w * 0.82:
            score -= (right - img_w * 0.82) * 1.4
        if w > img_w * 0.45 and right > img_w * 0.75:
            score -= min(260.0, (w - img_w * 0.45) * 1.2 + (right - img_w * 0.75) * 1.0)

        # ">=" lets a later component replace an equal-scored earlier one.
        if winner is None or score >= winner[0]:
            winner = (score, x, y, right, bottom)

    if winner is None:
        return None
    left, top, right, bottom = winner[1:]
    return {
        'left': int(left),
        'top': int(top),
        'right': int(right),
        'bottom': int(bottom),
        'width': int(right - left),
        'height': int(bottom - top),
    }


# Recognise text together with its position using the OCR provider's engine.
def ocr_items_with_boxes(img: Image.Image, offset_x: int = 0, offset_y: int = 0) -> list[dict]:
    """Run ``ocr.provider.engine.ocr`` on *img* and return positioned items.

    Returns ``[]`` when the provider is not ready, has no engine, the engine
    call raises, or the result is not a list. Items scoring below 0.3 or
    lacking a box are skipped. Coordinates (``left``/``right``/``top``/
    ``bottom``/``cx``/``cy``) are shifted by ``offset_x``/``offset_y``;
    ``width`` and ``height`` are not.
    """
    provider = getattr(ocr, 'provider', None)
    engine = getattr(provider, 'engine', None)
    if not getattr(provider, 'ready', False) or engine is None:
        return []

    try:
        result = engine.ocr(np.array(img.convert('RGB')))
    except Exception:
        # Best effort: any engine failure yields no items.
        return []

    found = []
    if not isinstance(result, list):
        return found

    for block in result:
        if not isinstance(block, dict):
            continue
        texts = block.get('rec_texts') or []
        scores = block.get('rec_scores') or []
        boxes = block.get('rec_boxes')
        # Explicit None check rather than `or []` on the boxes value.
        if boxes is None:
            boxes = []

        for pos, raw_text in enumerate(texts):
            content = str(raw_text).strip()
            if not content:
                continue
            has_score = pos < len(scores) and scores[pos] is not None
            score = float(scores[pos]) if has_score else 0.0
            if score < 0.3 or pos >= len(boxes):
                continue

            geometry = np.array(boxes[pos]).astype(float)
            if geometry.ndim == 1:
                # Flat box: the first four values are left, top, right, bottom.
                if geometry.size < 4:
                    continue
                left = float(geometry[0])
                top = float(geometry[1])
                right = float(geometry[2])
                bottom = float(geometry[3])
                cx = (left + right) / 2.0
                cy = (top + bottom) / 2.0
            else:
                # Point list: take the extents and the mean point.
                xs = geometry[:, 0]
                ys = geometry[:, 1]
                left, right = float(xs.min()), float(xs.max())
                top, bottom = float(ys.min()), float(ys.max())
                cx = float(xs.mean())
                cy = float(ys.mean())

            found.append({
                'text': content,
                'score': score,
                'left': left + offset_x,
                'right': right + offset_x,
                'top': top + offset_y,
                'bottom': bottom + offset_y,
                'cx': cx + offset_x,
                'cy': cy + offset_y,
                'height': (bottom - top),
                'width': (right - left),
            })
    return found


# Decide whether an OCR text line is worth keeping.
def should_keep_line(text: str) -> bool:
    """Return True when *text* has any non-whitespace content (None counts as empty)."""
    stripped = (text or '').strip()
    return len(stripped) > 0


# Clean candidate text lines: strip them, drop blanks, drop a trailing symbol-only line.
def cleanup_candidate_lines(lines: list[str]) -> list[str]:
    """Return *lines* stripped, without blank entries.

    If at least two lines remain and the last one is at most three
    characters made only of ``: ： ; ； S s x X``, that last line is removed
    so the preceding line becomes the final one.
    """
    stripped = (str(raw).strip() for raw in lines)
    kept = [text for text in stripped if text]

    # A short trailing line of stray symbols is dropped in favour of the line above it.
    if len(kept) >= 2:
        tail = kept[-1]
        if len(tail) <= 3 and re.fullmatch(r'[:：;；SsxX]+', tail):
            kept = kept[:-1]

    return kept


# Remove duplicate text lines while keeping first-seen order.
def dedupe_lines(lines: list[str]) -> list[str]:
    """Return the stripped, non-empty lines of *lines* with duplicates removed.

    Each entry is converted with ``str`` and stripped before comparison; the
    first occurrence of each text is kept, in order.
    """
    stripped = (str(line).strip() for line in lines)
    # dict preserves insertion order, so fromkeys keeps the first occurrence.
    return list(dict.fromkeys(text for text in stripped if text))


# Drop OCR items that repeat the same text at (almost) the same position.
def dedupe_items_by_geometry(items: list[dict]) -> list[dict]:
    """Return *items* without positional duplicates, ordered by ``(cy, left)``.

    Items are visited from highest ``score`` (missing score counts as 0.0),
    then by ``top`` and ``left``. An item is a duplicate when an already kept
    item has the same stripped text and its centre lies within 18 px
    horizontally and 12 px vertically.
    """
    if not items:
        return []

    by_priority = sorted(items, key=lambda x: (-x.get('score', 0.0), x['top'], x['left']))
    survivors: list[dict] = []
    for candidate in by_priority:
        label = candidate['text'].strip()
        already_present = any(
            prior['text'].strip() == label
            and abs(prior['cx'] - candidate['cx']) <= 18
            and abs(prior['cy'] - candidate['cy']) <= 12
            for prior in survivors
        )
        if not already_present:
            survivors.append(candidate)

    survivors.sort(key=lambda x: (x['cy'], x['left']))
    return survivors


# Group OCR items into clusters of vertically adjacent items.
def build_line_clusters(items: list[dict], y_gap: float = 18.0) -> list[list[dict]]:
    """Split *items* into clusters based on the vertical gap between neighbours.

    Items with blank text are dropped and positional duplicates removed
    first. Walking the remaining items in the order returned by
    ``dedupe_items_by_geometry``, an item joins the current cluster when its
    ``cy`` is within ``max(y_gap, 1.15 * average height)`` of the previous
    item's ``cy`` (average height floored at 10); otherwise it starts a new
    cluster.
    """
    usable = dedupe_items_by_geometry([x for x in items if should_keep_line(x['text'])])
    if not usable:
        return []

    groups: list[list[dict]] = [[usable[0]]]
    for candidate in usable[1:]:
        anchor = groups[-1][-1]
        mean_height = max(10.0, (anchor.get('height', 0) + candidate.get('height', 0)) / 2.0)
        allowed_gap = max(y_gap, mean_height * 1.15)
        if abs(candidate['cy'] - anchor['cy']) <= allowed_gap:
            groups[-1].append(candidate)
        else:
            groups.append([candidate])
    return groups


# Merge the OCR items of each row into a single text line.
def cluster_to_lines(cluster: list[dict]) -> list[str]:
    """Return one joined string per row of *cluster*, without duplicate lines.

    Rows come from ``build_line_clusters`` with ``y_gap=10.0``. Inside a row,
    item texts are concatenated without a separator, skipping blanks and any
    text equal to the piece just before it.
    """
    merged_lines = []
    for row in build_line_clusters(cluster, y_gap=10.0):
        # NOTE(review): the helper re-sorts its output by (cy, left), so the
        # left-to-right pre-sort here does not decide the final order.
        ordered = dedupe_items_by_geometry(sorted(row, key=lambda x: x['left']))
        fragments = []
        for entry in ordered:
            piece = entry['text'].strip()
            if piece and (not fragments or fragments[-1] != piece):
                fragments.append(piece)
        joined = ''.join(fragments)
        if joined:
            merged_lines.append(joined)
    return dedupe_lines(merged_lines)


# Compute the bounding box (top/bottom/left/right) and mean centre-x of a text cluster.
def cluster_bounds(cluster: list[dict]) -> dict:
    """Return the extents of *cluster* plus the average of its ``cx`` values.

    Raises ``ValueError`` for an empty cluster (from ``min``).
    """
    top_edge = min(entry['top'] for entry in cluster)
    bottom_edge = max(entry['bottom'] for entry in cluster)
    left_edge = min(entry['left'] for entry in cluster)
    right_edge = max(entry['right'] for entry in cluster)
    mean_cx = sum(entry['cx'] for entry in cluster) / len(cluster)
    return dict(top=top_edge, bottom=bottom_edge, left=left_edge, right=right_edge, cx=mean_cx)


# Estimate the height of a single text line from the items of a cluster.
def estimate_cluster_line_height(cluster: list[dict]) -> float:
    """Return the median positive item height, floored at 14.0.

    Missing, ``None`` or non-positive heights are ignored; when no usable
    height remains the default 18.0 is returned.
    """
    positive_heights = []
    for entry in cluster:
        value = float(entry.get('height', 0) or 0)
        if value > 0:
            positive_heights.append(value)
    if positive_heights:
        return max(14.0, float(np.median(positive_heights)))
    return 18.0


# Crop the bubble region around a text cluster, locating the bubble by colour and morphology.
def crop_bubble_box(img: Image.Image, cluster_items: list[dict], stem: str | None = None) -> tuple[Image.Image | None, dict | None]:
    """Locate and crop the chat bubble that encloses a cluster of OCR items.

    Steps:
      1. Prefer a connected component of the merged binary image that
         contains the text centre and overlaps the text horizontally.
      2. Otherwise fall back to components of a colour-distance mask.
      3. Grow the chosen box edge by edge while enough pixels match the
         bubble colour, then pad (right side) or clamp (left side).
      4. If no component was found, use the text bounds with fixed padding.

    The cluster counts as a left-side bubble when its mean ``cx`` is at most
    ``LEFT_BUBBLE_CX_RATIO`` of the image width; that single decision is used
    throughout the function.

    Args:
        img: Source screenshot.
        cluster_items: OCR item dicts providing ``top``, ``bottom``, ``left``,
            ``right`` and ``cx`` (``height`` is used when present).
        stem: When truthy, the merged binary image is also saved as
            ``bubble_bin.png`` in the 'merged' output directory for this stem.

    Returns:
        ``(cropped_image, box)`` where ``box`` has left/top/right/bottom/
        width/height, or ``(None, None)`` when *cluster_items* is empty or a
        region is degenerate.
    """
    if not cluster_items:
        return None, None

    arr = np.array(img.convert('RGB'))
    img_h, img_w = arr.shape[:2]
    bounds = cluster_bounds(cluster_items)

    # Seed window: the text bounds padded a little, clipped to the image.
    seed_left = max(0, int(bounds['left'] - 20))
    seed_right = min(img_w, int(bounds['right'] + 20))
    seed_top = max(0, int(bounds['top'] - 30))
    seed_bottom = min(img_h, int(bounds['bottom'] + 14))
    if seed_right <= seed_left or seed_bottom <= seed_top:
        return None, None

    seed = arr[seed_top:seed_bottom, seed_left:seed_right]
    if seed.size == 0:
        return None, None

    # Peer bubbles are anchored on colours near #EEEEF0; own messages use
    # the median of the non-bright seed pixels instead.
    seed_pixels = seed.reshape(-1, 3).astype(np.int16)
    brightness = seed_pixels.mean(axis=1)
    bubble_pixels = seed_pixels[brightness < BRIGHTNESS_CUTOFF]
    if bubble_pixels.size == 0:
        bubble_pixels = seed_pixels
    median_target = np.median(bubble_pixels, axis=0)
    peer_target = np.array(PEER_BUBBLE_RGB, dtype=np.int16)  # #EEEEF0
    # Left messages anchor on the grey #EEEEF0 bubble; the right side keeps
    # the adaptive (seed-median) strategy.
    is_left_bubble = bounds['cx'] <= img_w * LEFT_BUBBLE_CX_RATIO
    if is_left_bubble:
        target = peer_target
        threshold = LEFT_COLOR_THRESHOLD
    else:
        target = median_target.astype(np.int16)
        threshold = RIGHT_COLOR_THRESHOLD

    # Per-pixel L1 colour distance to the target colour.
    diff = np.abs(arr.astype(np.int16) - target.reshape(1, 1, 3))
    dist = diff.sum(axis=2)

    bg_mask = (arr[:, :, 0] > BG_WHITE_THRESHOLD) & (arr[:, :, 1] > BG_WHITE_THRESHOLD) & (arr[:, :, 2] > BG_WHITE_THRESHOLD)
    bright_mask = arr.mean(axis=2) > BRIGHT_MASK_THRESHOLD
    color_mask = (dist <= threshold) & (~bg_mask) & (~bright_mask)
    color_mask = color_mask.astype(np.uint8) * 255

    if is_left_bubble:
        # Left grey bubbles tend to merge vertically adjacent messages, so
        # weaken vertical closing and strengthen horizontal connectivity.
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (11, 3))
        kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    else:
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 7))
        kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_CLOSE, kernel_close)
    color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)

    text_cx = bounds['cx']
    text_cy = (bounds['top'] + bounds['bottom']) / 2.0
    text_h = max(1.0, bounds['bottom'] - bounds['top'])
    text_w = max(1.0, bounds['right'] - bounds['left'])

    def is_probable_left_avatar_candidate(stats_arr, label_idx: int, x: int, y: int, w: int, h: int) -> bool:
        """Return True when a component looks like a left-edge avatar.

        It must be roughly square, start in the left 20% of the image, be of
        avatar-like size relative to the text height, and have a wider
        component to its right on the same row.
        """
        if not is_left_bubble:
            return False
        if x > img_w * 0.2:
            return False
        if w <= 0 or h <= 0:
            return False
        wh_ratio = w / max(1.0, float(h))
        if wh_ratio < 0.72 or wh_ratio > 1.35:
            return False
        if w < max(18.0, text_h * 0.55) or w > max(92.0, text_h * 2.4):
            return False

        right_edge = x + w
        row_hit = False
        n = stats_arr.shape[0] if hasattr(stats_arr, 'shape') else len(stats_arr)
        for j in range(1, n):
            if j == label_idx:
                continue
            ox, oy, ow, oh, oarea = stats_arr[j]
            if oarea <= 0 or ow <= 0 or oh <= 0:
                continue
            if ox <= right_edge:
                continue
            gap = ox - right_edge
            if gap < 2 or gap > max(120.0, text_w * 1.4):
                continue
            overlap_h = max(0.0, min(y + h, oy + oh) - max(y, oy))
            if overlap_h < min(h, oh) * 0.45:
                continue
            if ow < w * 1.15:
                continue
            if oh < h * 0.75:
                continue
            row_hit = True
            break
        return row_hit

    def pick_box_from_binary_merge() -> tuple[int, int, int, int] | None:
        """Pick the best merged-binary component containing the text centre."""
        merged_out_dir = get_image_out_dir(stem, 'merged') if stem else None
        merged_img = build_merged_binary_image(img, is_left_bubble=is_left_bubble)
        merged = np.array(merged_img)

        # Save the binary image for inspection when a stem is given.
        if merged_out_dir is not None:
            merged_img.save(merged_out_dir / 'bubble_bin.png')

        num_labels2, _, stats2, _ = cv2.connectedComponentsWithStats(merged, 8)
        best_local = None
        for label in range(1, num_labels2):
            x, y, w, h, area = stats2[label]
            if area <= 0 or w <= 0 or h <= 0:
                continue
            if is_probable_left_avatar_candidate(stats2, label, x, y, w, h):
                continue
            right = x + w
            bottom = y + h
            # The component's bounding box must contain the text centre.
            if not (x <= text_cx <= right and y <= text_cy <= bottom):
                continue

            top_gap = max(0.0, bounds['top'] - y)
            bottom_gap = max(0.0, bottom - bounds['bottom'])
            overlap_x = max(0.0, min(bounds['right'], right) - max(bounds['left'], x))
            overlap_ratio = overlap_x / text_w if text_w > 0 else 0.0
            if overlap_ratio < 0.45:
                continue

            score = 0.0
            score += bottom * BIN_SCORE_BOTTOM
            score += area * BIN_SCORE_AREA
            score += overlap_ratio * BIN_SCORE_OVERLAP
            if y < img_h * 0.18:
                score -= (img_h * 0.18 - y) * 2.0
            score -= top_gap * (BIN_PENALTY_TOP_GAP_LEFT if is_left_bubble else BIN_PENALTY_TOP_GAP_RIGHT)
            score -= abs(bottom_gap - 12.0) * BIN_PENALTY_BOTTOM_GAP

            max_reasonable_h = max(120.0, text_h * 3.4)
            max_reasonable_w = max(360.0, text_w * 1.9)
            if h > max_reasonable_h:
                score -= (h - max_reasonable_h) * BIN_PENALTY_TOO_TALL
            if w > max_reasonable_w:
                score -= (w - max_reasonable_w) * BIN_PENALTY_TOO_WIDE

            if is_left_bubble:
                # Bonus for regions whose median colour is close to the peer bubble colour.
                region = arr[y:bottom, x:right].reshape(-1, 3).astype(np.int16)
                if region.size > 0:
                    region_mean = np.median(region, axis=0)
                    color_score = float(320 - np.abs(region_mean - peer_target).sum())
                    score += max(0.0, color_score)

            if best_local is None or score > best_local[0]:
                best_local = (score, x, y, right, bottom)
        if best_local is None:
            return None
        return best_local[1], best_local[2], best_local[3], best_local[4]

    best = None
    binary_box = pick_box_from_binary_merge()
    if binary_box is not None:
        left, top, right, bottom = binary_box
        best = (float(bottom), left, top, right, bottom)
    else:
        # Fallback: score components of the colour-distance mask.
        num_labels, _, stats, _ = cv2.connectedComponentsWithStats(color_mask, 8)
        for label in range(1, num_labels):
            x, y, w, h, area = stats[label]
            if area <= 0 or w <= 0 or h <= 0:
                continue
            if is_probable_left_avatar_candidate(stats, label, x, y, w, h):
                continue
            right = x + w
            bottom = y + h
            if not (x <= text_cx <= right and y <= text_cy <= bottom):
                continue

            top_gap = max(0.0, bounds['top'] - y)
            bottom_gap = max(0.0, bottom - bounds['bottom'])
            overlap_x = max(0.0, min(bounds['right'], right) - max(bounds['left'], x))
            overlap_ratio = overlap_x / text_w if text_w > 0 else 0.0

            score = 0.0
            score += area * COLOR_SCORE_AREA
            score += bottom * COLOR_SCORE_BOTTOM
            score += overlap_ratio * COLOR_SCORE_OVERLAP
            if y < img_h * OCR_TOP_PENALTY_RATIO:
                score -= (img_h * OCR_TOP_PENALTY_RATIO - y) * OCR_TOP_PENALTY_COLOR_FACTOR

            if is_left_bubble:
                region = arr[y:bottom, x:right].reshape(-1, 3).astype(np.int16)
                region_mean = np.median(region, axis=0)
                color_score = float(320 - np.abs(region_mean - peer_target).sum())
                score += max(0.0, color_score)
                # The target is the last grey bubble, not the whole grey column.
                score -= top_gap * COLOR_PENALTY_TOP_GAP_LEFT
                score -= abs(bottom_gap - 12.0) * COLOR_PENALTY_BOTTOM_GAP
                max_reasonable_h = max(110.0, text_h * 2.8)
                if h > max_reasonable_h:
                    score -= (h - max_reasonable_h) * COLOR_PENALTY_TOO_TALL
                max_reasonable_w = max(320.0, text_w * 1.6)
                if w > max_reasonable_w:
                    score -= (w - max_reasonable_w) * COLOR_PENALTY_TOO_WIDE
            else:
                score += top_gap * COLOR_SCORE_TOP_GAP_RIGHT
                score += h * COLOR_SCORE_HEIGHT_RIGHT

            if best is None or score > best[0]:
                best = (score, x, y, right, bottom)

    if best is not None:
        _, left, top, right, bottom = best

        if is_left_bubble:
            # If the winner itself looks like an avatar (near the left edge,
            # roughly square, small), replace it with the padded text bounds.
            cand_w = max(1, int(right - left))
            cand_h = max(1, int(bottom - top))
            cand_ratio = cand_w / float(cand_h)
            if left <= int(img_w * 0.2) and 0.70 <= cand_ratio <= 1.35 and cand_w <= max(96, int(text_h * 2.6)):
                left = max(0, int(bounds['left'] - 6))
                right = min(img_w, int(bounds['right'] + 12))
                top = max(0, int(bounds['top'] - 4))
                bottom = min(img_h, int(bounds['bottom'] + 10))

        # Directional growth from the found block.
        def col_match_ratio(x: int, y1: int, y2: int) -> float:
            """Fraction of pixels in column x, rows [y1, y2), within the colour threshold."""
            seg = dist[max(0, y1):min(img_h, y2), max(0, x):min(img_w, x + 1)]
            if seg.size == 0:
                return 0.0
            return float((seg <= threshold).mean())

        def row_match_ratio(y: int, x1: int, x2: int) -> float:
            """Fraction of pixels in row y, columns [x1, x2), within the colour threshold."""
            seg = dist[max(0, y):min(img_h, y + 1), max(0, x1):min(img_w, x2)]
            if seg.size == 0:
                return 0.0
            return float((seg <= threshold).mean())

        line_h = estimate_cluster_line_height(cluster_items)
        current_h = max(1.0, float(bottom - top))
        # The target height is not hard-coded; it is estimated from the
        # detected text line height. Empirically a 4-line message block needs
        # roughly 5-6 line heights; when the box is too small, most of the
        # extra height is spent growing upwards.
        desired_h = max(current_h, line_h * 5.6)
        need_more_h = max(0.0, desired_h - current_h)

        # Left messages are kept tight around the text to avoid oversized boxes.
        expand_left_limit = 12 if is_left_bubble else 30
        expand_right_limit = 22 if is_left_bubble else 90
        expand_up_limit = max(6, int(min(18, line_h * 0.55 + need_more_h * 0.08))) if is_left_bubble else max(40, int(min(120, line_h * 3.8 + need_more_h * 0.9)))
        expand_down_limit = max(10, int(min(28, line_h * 0.85 + need_more_h * 0.10))) if is_left_bubble else max(20, int(min(60, line_h * 1.6 + need_more_h * 0.25)))

        # Grow left one column at a time while enough pixels match.
        for _ in range(expand_left_limit):
            if left <= 1:
                break
            ratio = col_match_ratio(left - 1, top, bottom)
            if ratio < 0.18:
                break
            left -= 1

        # Grow right.
        for _ in range(expand_right_limit):
            if right >= img_w - 1:
                break
            ratio = col_match_ratio(right, top, bottom)
            if ratio < 0.18:
                break
            right += 1

        # Grow up, never past top_guard.
        up_steps = 0
        top_guard = max(0, int(bounds['top'] - (line_h * 0.9 if is_left_bubble else line_h * 2.2)))
        for _ in range(expand_up_limit):
            if top <= 1 or top <= top_guard:
                break
            ratio = row_match_ratio(top - 1, left, right)
            # Upward growth is much stricter for left messages, to avoid
            # pulling in the nickname / the area beside the avatar.
            if is_left_bubble:
                if ratio < 0.18:
                    break
            else:
                if ratio < 0.10 and up_steps > int(line_h * 0.8):
                    break
            top -= 1
            up_steps += 1

        # Grow down; the first few rows are taken regardless of the match ratio.
        down_steps = 0
        for _ in range(expand_down_limit):
            if bottom >= img_h - 1:
                break
            ratio = row_match_ratio(bottom, left, right)
            if ratio < 0.16 and down_steps > int(line_h * 0.5):
                break
            bottom += 1
            down_steps += 1

        # If the box is still clearly shorter than the estimated height,
        # compensate; left messages get only a small compensation so the box
        # does not turn into a large enclosing frame.
        current_h2 = max(1, bottom - top)
        missing_h = max(0, int(desired_h - current_h2))
        if missing_h > 0:
            if is_left_bubble:
                extra_up = min(max(0, int(missing_h * 0.08)), max(0, top - top_guard))
                extra_down = min(max(0, int(missing_h * 0.35)), max(0, img_h - bottom))
            else:
                extra_up = min(max(0, int(missing_h * 0.75)), max(0, top))
                extra_down = min(max(0, int(missing_h * 0.25)), max(0, img_h - bottom))
            top -= extra_up
            bottom += extra_down

        # Left messages get a final clamp to a narrow band around the text
        # bounds (tuned against a manually annotated reference box).
        if is_left_bubble:
            target_left = int(bounds['left'] - 6)
            target_top = int(bounds['top'] - max(3, line_h * 0.28))
            target_right = int(bounds['right'] + 10)
            target_bottom = int(bounds['bottom'] + max(8, line_h * 0.45))
            left = max(0, min(int(left), target_left))
            left = max(left, max(0, target_left - 2))
            top = max(0, min(int(top), target_top))
            top = max(top, max(0, target_top - 2))
            right = min(img_w, max(int(right), target_right))
            right = min(right, target_right + 4)
            bottom = min(img_h, max(int(bottom), target_bottom))
            bottom = min(bottom, target_bottom + 4)
        else:
            # Right-side bubbles just get a 4 px margin, clipped to the image.
            left = max(0, int(left - 4))
            top = max(0, int(top - 4))
            right = min(img_w, int(right + 4))
            bottom = min(img_h, int(bottom + 4))
    else:
        # Only when no block was found: fall back to the text box with light
        # padding; left messages hug the text more tightly.
        if is_left_bubble:
            left = max(0, int(bounds['left'] - 6))
            right = min(img_w, int(bounds['right'] + 12))
            top = max(0, int(bounds['top'] - 4))
            bottom = min(img_h, int(bounds['bottom'] + 10))
        else:
            left = max(0, int(bounds['left'] - 24))
            right = min(img_w, int(bounds['right'] + 26))
            top = max(0, int(bounds['top'] - 42))
            bottom = min(img_h, int(bounds['bottom'] + 22))

    if right <= left or bottom <= top:
        return None, None
    box = {'left': left, 'top': top, 'right': right, 'bottom': bottom, 'width': right - left, 'height': bottom - top}
    return img.crop((left, top, right, bottom)), box


# 对气泡图片进行OCR识别，返回识别到的文本行列表
def ocr_bubble_text(bubble_img: Image.Image, stem: str) -> list[str]:
    """Run OCR on a cropped chat bubble and return its cleaned text lines.

    The crop is enlarged 5x (LANCZOS) and handed to ``ocr_lines``. The lines
    are stripped, filtered through ``should_keep_line`` and de-duplicated in
    first-seen order. For small crops (at most 90x60 px) whose first pass
    produced no more than two characters, a second pass runs on an
    Otsu-binarised, 5x enlarged copy whose dark regions were thickened; that
    copy is also saved as a debug image in the 'ocr' output directory of
    ``stem``. The second-pass text is used only when it is longer than the
    first-pass text.

    Args:
        bubble_img: Cropped bubble image to recognise.
        stem: Snapshot identifier; used to build the label passed to
            ``ocr_lines`` and to locate the debug image directory.

    Returns:
        A single-element list holding the joined short text when the second
        pass wins, otherwise the normalised first-pass lines (possibly empty).
    """
    scale = 5
    scaled_img = bubble_img.resize(
        (bubble_img.width * scale, bubble_img.height * scale),
        Image.Resampling.LANCZOS,
    )
    raw_lines = ocr_lines(scaled_img, f'clicked_{stem}_bubble_crop')

    def normalize(lines: list[str]) -> list[str]:
        # dict.fromkeys keeps first-seen order while dropping duplicates.
        stripped = (str(line).strip() for line in lines)
        return list(dict.fromkeys(text for text in stripped if should_keep_line(text)))

    def build_bold_binary_image(source_img: Image.Image) -> Image.Image:
        gray = np.array(source_img.convert('L'))
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        binary_img = Image.fromarray(binary).convert('L')
        # NEAREST keeps the enlarged image strictly black/white.
        scaled_binary = binary_img.resize(
            (source_img.width * scale, source_img.height * scale),
            Image.Resampling.NEAREST,
        )
        arr = np.array(scaled_binary)
        kernel = np.ones((2, 2), np.uint8)
        # Dilating the inverted image grows the dark regions of the original.
        dilated = cv2.dilate(255 - arr, kernel, iterations=1)
        return Image.fromarray(255 - dilated).convert('RGB')

    def recognize_joined_short_text(source_img: Image.Image) -> str | None:
        items = ocr_items_direct(source_img)
        if len(items) < 2:
            return None
        # Left-to-right (then top-to-bottom) order; sorted() leaves the
        # list returned by the OCR helper untouched.
        ordered = sorted(items, key=lambda item: (item['left'], item['top']))
        text = ''.join(item['text'] for item in ordered if should_keep_line(item['text']))
        if not text:
            return None
        # Only accept short, purely alphanumeric/CJK results.
        if len(text) <= 8 and re.fullmatch(r'[一-鿿A-Za-z0-9]+', text):
            return text
        return None

    raw_norm = normalize(raw_lines)
    raw_char_count = len(''.join(raw_norm))
    is_small_crop = bubble_img.width <= 90 and bubble_img.height <= 60
    # An empty first pass has a character count of 0, so it is covered here too.
    if is_small_crop and raw_char_count <= 2:
        bold_binary_img = build_bold_binary_image(bubble_img)
        bold_binary_img.save(get_image_out_dir(stem, 'ocr') / 'bubble_crop_bold_binary_5x.png')
        joined_text = recognize_joined_short_text(bold_binary_img)
        if joined_text and len(joined_text) > raw_char_count:
            return [joined_text]

    return raw_norm


# 从OCR识别项中选择最新的消息簇，返回文本和原始簇
def pick_latest_cluster(items: list[dict], image_w: int, image_h: int) -> tuple[list[str], list[str], list[dict]]:
    """Pick the highest-scoring left-side text cluster from OCR items.

    Items are grouped by ``build_line_clusters``; each cluster with text is
    scored from its bottom edge, height, width and character count (weighted
    by the ``CLUSTER_SCORE_*`` constants). Clusters that are not on the left
    side are dropped. Single short lines and narrow clusters are penalised
    unless they sit in the lower part of the image and contain CJK text.

    Args:
        items: OCR items; each is read for 'text' here and passed on to the
            clustering helpers.
        image_w: Image width in pixels.
        image_h: Image height in pixels.

    Returns:
        ``(best_lines, all_lines, best_items)``: up to four lines of the
        winning cluster, the de-duplicated texts of all items, and the items
        of the winning cluster. Three empty lists when nothing qualifies.
    """
    if not items:
        return [], [], []
    line_gap = max(18.0, image_h * 0.018)
    clusters = build_line_clusters(items, y_gap=line_gap)
    if not clusters:
        return [], [], []

    cjk_pattern = re.compile(r'[一-鿿]')
    ranked = []
    for members in clusters:
        box = cluster_bounds(members)
        texts = cluster_to_lines(members)
        if not texts:
            continue

        box_h = box['bottom'] - box['top']
        box_w = box['right'] - box['left']
        char_total = sum(len(t) for t in texts)

        rating = 0.0
        rating += box['bottom'] * CLUSTER_SCORE_BOTTOM
        rating += min(120.0, box_h * CLUSTER_SCORE_HEIGHT)
        rating += min(80.0, box_w * CLUSTER_SCORE_WIDTH)
        rating += min(60.0, char_total * CLUSTER_SCORE_TEXT_LEN)
        if len(texts) >= 2:
            rating += CLUSTER_SCORE_MULTI_LINE_BONUS

        # Only clusters on the left side are candidates.
        on_left = (
            box['cx'] <= image_w * LEFT_BUBBLE_CX_RATIO
            and box['left'] <= image_w * RIGHT_BUBBLE_MIN_LEFT_RATIO
        )
        if not on_left:
            continue

        shortest = min((len(t.strip()) for t in texts if t and t.strip()), default=0)
        single_short_line = len(texts) == 1 and shortest <= 4
        narrow = box_w <= max(52.0, box_h * 1.25)
        if single_short_line or narrow:
            # CJK text near the bottom of the image is exempt from both penalties.
            exempt = box['bottom'] >= image_h * 0.62 and any(
                cjk_pattern.search(t or '') for t in texts
            )
            if not exempt:
                if single_short_line:
                    rating -= 120.0
                if narrow:
                    rating -= 140.0

        ranked.append((rating, texts, members))

    if not ranked:
        return [], [], []
    # max() returns the first entry with the highest rating.
    _, best_lines, best_cluster = max(ranked, key=lambda entry: entry[0])

    # To keep the crop on the last message block only, the previous cluster
    # is deliberately not merged upward for now.
    all_lines = dedupe_lines([entry['text'] for entry in items])
    return list(best_lines)[:4], all_lines, list(best_cluster)


# 从聊天快照中提取最新消息文本
def extract_latest_text(img: Image.Image, stem: str, preferred_ocr_img: Image.Image | None = None, preferred_crop_box: dict | None = None) -> tuple[str | None, list[str], list[dict], dict | None]:
    """Extract the text of the latest message from a chat snapshot.

    Saves the merged binary image as 'full_bin.png', runs boxed OCR on the
    full image, picks the latest cluster and crops its bubble. The bubble OCR
    runs on ``preferred_ocr_img`` when given, otherwise on the cropped bubble;
    the image used is saved as 'bubble_crop.png'. When the bubble OCR yields
    no lines, the cluster's own lines are used instead.

    Args:
        img: Full snapshot image.
        stem: Snapshot identifier used for the output directories.
        preferred_ocr_img: Optional image to OCR instead of the cropped bubble.
        preferred_crop_box: Box reported when ``preferred_ocr_img`` is used.

    Returns:
        ``(latest_text, valid_lines, cluster_items, crop_box)``;
        ``latest_text`` is None when no lines survive cleanup.
    """
    width, height = img.size
    merged_dir = get_image_out_dir(stem, 'merged')
    crops_dir = get_image_out_dir(stem, 'ocr')

    build_merged_binary_image(img).save(merged_dir / 'full_bin.png')

    detected = dedupe_items_by_geometry(ocr_items_with_boxes(img, offset_x=0, offset_y=0))
    kept_items: list[dict] = [
        entry for entry in detected if should_keep_line(entry['text'].strip())
    ]

    cluster_lines, valid_lines, cluster_items = pick_latest_cluster(kept_items, width, height)
    bubble_img, bubble_box = crop_bubble_box(img, cluster_items, stem)

    # The caller-supplied image and box are used together, or not at all.
    if preferred_ocr_img is not None:
        ocr_source_img, crop_box = preferred_ocr_img, preferred_crop_box
    else:
        ocr_source_img, crop_box = bubble_img, bubble_box

    if ocr_source_img is not None:
        ocr_source_img.save(crops_dir / 'bubble_crop.png')
        bubble_lines = cleanup_candidate_lines(ocr_bubble_text(ocr_source_img, stem))
        if bubble_lines:
            return '\n'.join(bubble_lines), valid_lines, cluster_items, crop_box

    # Fall back to the lines of the selected cluster.
    fallback_lines = cleanup_candidate_lines(list(cluster_lines))
    latest_block = '\n'.join(fallback_lines) if fallback_lines else None
    return latest_block, valid_lines, cluster_items, crop_box


# 检测最新消息气泡位于屏幕左侧还是右侧（判断是自己还是对方的消息）
def detect_latest_bubble_side(img: Image.Image, cluster_items: list[dict], crop_box: dict | None = None) -> tuple[str | None, float]:
    """Decide whether the latest bubble is on the left or right of the image.

    Three strategies are tried in order:
      1. ``crop_box`` geometry (confidence 0.9);
      2. geometry of ``cluster_items`` (confidence 0.84 or 0.78 depending on
         the cluster width);
      3. the lowest green region found in the lower part of the image
         (confidence between 0.50 and 0.72).
    If none of them decides, ``('left', 0.35)`` is returned.

    Args:
        img: Full snapshot image.
        cluster_items: OCR items of the selected cluster; 'cx', 'left' and
            'right' are read from each.
        crop_box: Optional box with 'left' and 'right' keys.

    Returns:
        ``(side, confidence)`` where side is 'left' or 'right'.
    """
    pixels = np.array(img.convert('RGB'))
    img_h, img_w = pixels.shape[:2]
    right_center_x = img_w * LEFT_BUBBLE_CX_RATIO
    right_min_left = img_w * RIGHT_BUBBLE_MIN_LEFT_RATIO

    # Strategy 1: crop box geometry.
    if crop_box:
        box_left = float(crop_box.get('left', 0))
        box_right = float(crop_box.get('right', box_left))
        box_center = (box_left + box_right) / 2.0
        if box_center >= right_center_x or box_left >= right_min_left:
            return 'right', 0.9
        return 'left', 0.9

    # Strategy 2: text cluster geometry.
    if cluster_items:
        mean_cx = sum(entry['cx'] for entry in cluster_items) / len(cluster_items)
        span_left = min(entry['left'] for entry in cluster_items)
        span_right = max(entry['right'] for entry in cluster_items)
        geometry_conf = 0.84 if (span_right - span_left) > img_w * 0.18 else 0.78
        if mean_cx >= right_center_x or span_left >= right_min_left:
            return 'right', geometry_conf
        if mean_cx <= img_w * 0.52 or span_right <= img_w * LEFT_SIDE_MAX_RIGHT_RATIO:
            return 'left', geometry_conf

    # Strategy 3: look for green regions in the lower 62% of the image.
    band_top = int(img_h * 0.38)
    hsv = cv2.cvtColor(pixels[band_top:, :, :], cv2.COLOR_RGB2HSV)
    green = cv2.inRange(hsv, np.array([35, 25, 80]), np.array([95, 255, 255]))
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    green = cv2.morphologyEx(green, cv2.MORPH_CLOSE, close_kernel)
    contours, _ = cv2.findContours(green, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def is_bubble_like(x0: int, box_w: int, box_h: int) -> bool:
        if box_w * box_h < 2500:
            return False
        if box_h < 28 or box_w < 80:
            return False
        # Must reach into the right third of the image.
        if x0 + box_w < img_w * 0.66:
            return False
        # Reject wide regions that start on the left.
        if x0 < img_w * 0.45 and box_w > img_w * 0.45:
            return False
        return True

    blobs = []
    for contour in contours:
        x0, y0, box_w, box_h = cv2.boundingRect(contour)
        if is_bubble_like(x0, box_w, box_h):
            blobs.append((y0 + band_top, x0, box_w, box_h, box_w * box_h))

    if blobs:
        # Lowest bottom edge wins; ties fall to the larger x, then the later entry.
        _, x0, box_w, _, area = sorted(blobs, key=lambda b: (b[0] + b[3], b[1]))[-1]
        blob_center = x0 + box_w / 2.0
        side = 'right' if blob_center >= right_center_x else 'left'
        confidence = min(0.72, 0.50 + min(0.22, area / 40000.0))
        return side, confidence

    return 'left', 0.35


# 分析PIL图片的主函数，提取最新消息、判断发送方并返回分析结果
def analyze_pil_image(img: Image.Image, stem: str, file_name: str | None = None) -> AnalyzeResult:
    """Analyse a chat snapshot: latest message text, bubble side and sender.

    Removes old PNG files from the snapshot's 'ocr' output directory, saves
    the RGB image as 'full_raw.png', optionally crops the last white box found
    in the merged binary image (saved as 'last_white_box_raw.png'), then
    extracts the latest text and detects the bubble side. A right-side bubble
    is reported as self-sent and its text and valid lines are cleared. The
    result is written to 'result.json' and logged.

    Args:
        img: Snapshot image; converted to RGB before processing.
        stem: Snapshot identifier; 'unknown' is used when empty.
        file_name: File name stored in the result; defaults to '<stem>.png'.

    Returns:
        The populated ``AnalyzeResult``.
    """
    trace_id = new_trace_id("bot")
    safe_stem = stem or 'unknown'

    # Clear PNG output left over from a previous run of the same snapshot.
    ocr_out_dir = get_image_out_dir(safe_stem, 'ocr')
    for stale_png in ocr_out_dir.glob('*.png'):
        stale_png.unlink(missing_ok=True)

    rgb_img = img.convert('RGB')
    original_out_dir = get_image_out_dir(safe_stem, 'original')
    rgb_img.save(original_out_dir / 'full_raw.png')

    preferred_ocr_img = None
    preferred_crop_box = None
    last_white_box = pick_last_white_box_from_merged_binary(build_merged_binary_image(rgb_img))
    if last_white_box is not None:
        preferred_crop_box = dict(last_white_box)
        crop_rect = tuple(
            int(last_white_box[edge]) for edge in ('left', 'top', 'right', 'bottom')
        )
        preferred_ocr_img = rgb_img.crop(crop_rect)
        preferred_ocr_img.save(original_out_dir / 'last_white_box_raw.png')

    latest_text, valid_lines, cluster_items, crop_box = extract_latest_text(
        rgb_img,
        safe_stem,
        preferred_ocr_img=preferred_ocr_img,
        preferred_crop_box=preferred_crop_box,
    )
    bubble_side, confidence = detect_latest_bubble_side(rgb_img, cluster_items, crop_box=crop_box)

    # A right-side bubble counts as self-sent; its text is not reported.
    on_right = bubble_side == 'right'
    if on_right:
        latest_text = None
        valid_lines = []
    is_self_sent = None if bubble_side is None else on_right

    result = AnalyzeResult(
        file=file_name or f'{safe_stem}.png',
        size=rgb_img.size,
        crop_box=crop_box,
        latest_text=latest_text,
        is_self_sent=is_self_sent,
        bubble_side=bubble_side,
        confidence=confidence,
        valid_lines=valid_lines,
        error=None,
    )
    result_json = json.dumps(asdict(result), ensure_ascii=False, indent=2)
    (ocr_out_dir / 'result.json').write_text(result_json, encoding='utf-8')
    log_event(
        "INFO",
        "bot",
        "bot.chat_snapshot",
        trace_id,
        "analyze",
        "ok",
        "聊天快照分析完成",
        extra={
            "file": result.file,
            "has_text": bool(result.latest_text),
            "bubble_side": result.bubble_side or "",
            "confidence": result.confidence,
        },
    )
    return result