端到端文字识别：统一检测与识别的联合模型

FreeGuideOnline 最新 2026-06-25

```python
import torch
import torch.nn as nn
```

class SimpleEndToEndSpotter(nn.Module):
    """End-to-end text spotter that chains detection and recognition.

    The four sub-modules are supplied by the caller and stored unchanged;
    this class only wires them together.

    Args:
        backbone: Feature extractor, called as ``backbone(images)``
            (e.g. ResNet+FPN).
        rpn_head: Region proposal network, called as
            ``rpn_head(features, targets)``; must return
            ``(proposals, loss)``.
        roi_head: Detection head, called as
            ``roi_head(features, proposals, targets)``; must return
            ``(roi_features, loss)`` and provide
            ``postprocess(features, proposals)``.
        recog_head: Sequence recognizer, called as
            ``recog_head(text_features, text_sequences)`` to obtain the
            recognition loss; must provide ``decode(feature)``.
    """

    def __init__(self, backbone, rpn_head, roi_head, recog_head):
        super().__init__()
        self.backbone = backbone  # e.g., ResNet+FPN
        self.rpn = rpn_head  # region proposal network
        self.roi = roi_head  # detection & RoI feature extractor
        self.recog = recog_head  # sequence recognizer

    def forward(self, images, targets=None):
        """Run detection and recognition on ``images``.

        Args:
            images: Input passed straight to the backbone.
            targets: Ground-truth mapping, required in training mode. It is
                forwarded to the RPN and RoI heads, and its
                ``'text_roi_indices'`` and ``'text_sequences'`` entries are
                used for the recognition loss. Optional in eval mode.

        Returns:
            In training mode, the sum of the RPN loss, the RoI detection
            loss and the recognition loss. In eval mode, the detections
            returned by ``roi.postprocess``, each with a ``'text'`` entry
            added from ``recog.decode(det['feature'])``.

        Raises:
            ValueError: If the module is in training mode and ``targets``
                is ``None``.
        """
        # Fail early with a clear message: without targets there is no
        # recognition loss to add, and the sum below would raise TypeError.
        if self.training and targets is None:
            raise ValueError(
                "targets must be provided when the model is in training mode"
            )

        features = self.backbone(images)

        # Detection stage (trainable).
        proposals, det_loss = self.rpn(features, targets)
        roi_features, detection_loss = self.roi(features, proposals, targets)

        if self.training:
            # Recognition stage: select the RoI features that belong to text
            # instances and feed them to the recognizer.
            text_features = roi_features[targets['text_roi_indices']]
            recog_loss = self.recog(text_features, targets['text_sequences'])
            return det_loss + detection_loss + recog_loss

        # Inference: combine the detection results with the decoded text.
        # The recognition loss is not computed here because it would be
        # discarded anyway.
        detections = self.roi.postprocess(features, proposals)
        for det in detections:
            det['text'] = self.recog.decode(det['feature'])
        return detections