知识蒸馏训练策略:从离线到在线蒸馏的时序设计

FreeGuideOnline 最新 2026-06-27

python

伪代码:三阶段时序蒸馏

# Three-stage scheduled distillation (pseudo-code).
#
# Names such as student, teacher, loader, CE, KL, schedule, tau,
# optimizer_student and update_teacher_params are expected to be defined
# by the surrounding training script.
from contextlib import nullcontext

for epoch in range(total_epochs):
    # Select the training phase for this epoch.
    if epoch < warmup_epochs:
        # Stage 1 (warm-up): train the student on ground-truth labels only.
        alpha = 1.0
        update_teacher = False
    elif epoch < offline_distill_epochs:
        # Stage 2 (offline distillation): teacher is frozen.
        alpha = 0.5
        teacher.eval()
        update_teacher = False
    else:
        # Stage 3 (online co-training): teacher is updated alongside student.
        alpha = schedule(epoch)  # dynamically adjusted hard/soft loss weight
        teacher.train()
        update_teacher = True

    # The batch loop runs once per epoch, using the phase settings above.
    for data, target in loader:
        # Clear gradients left over from the previous step so they do not
        # accumulate across batches.
        optimizer_student.zero_grad()

        # Student forward pass and hard-label loss.
        s_out = student(data)
        loss = alpha * CE(s_out, target)

        # alpha == 1.0 means "labels only": skip the teacher pass entirely.
        if alpha < 1.0:
            # Track gradients through the teacher only while it is being
            # updated; otherwise run it under no_grad.
            teacher_ctx = nullcontext() if update_teacher else torch.no_grad()
            with teacher_ctx:
                t_out = teacher(data)
            # Both distributions are normalized over the class dimension
            # (dim=1), matching the teacher-side softmax.
            # NOTE(review): Hinton-style distillation usually scales the
            # soft loss by tau ** 2; KL is defined elsewhere, so confirm
            # whether it already applies that factor.
            soft_loss = KL(F.log_softmax(s_out / tau, dim=1),
                           F.softmax(t_out / tau, dim=1))
            loss += (1 - alpha) * soft_loss

        loss.backward()
        optimizer_student.step()

        if update_teacher:
            # Synchronize or EMA-update the teacher.
            # NOTE(review): in this phase gradients also flow into the
            # teacher via t_out; confirm update_teacher_params handles
            # (or clears) them.
            update_teacher_params()