委员会查询:用集成模型分歧发现关键样本
FreeGuideOnline
最新
2026-06-27
import numpy as np
from scipy.stats import entropy
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the data.
data = load_breast_cancer()
X = data.data
y = data.target

# Split: keep a small set of initial labels and treat the large remainder as
# the unlabelled pool (stratified so both parts keep the class balance).
X_train, X_unlabeled, y_train, y_unlabeled = train_test_split(
    X,
    y,
    test_size=0.95,
    random_state=42,
    stratify=y,
)

# Simulating the real scenario: the pool's labels are not used for now
# (except for evaluation).
print(f"初始标注样本数: {len(y_train)}, 未标注池大小: {len(X_unlabeled)}")
### 4.2 构建异质委员会
委员会成员必须保持多样性。这里我们选择随机森林、梯度提升树和 SVM 作为三个成员。为了让 SVM 能够输出概率,我们将其 `probability` 参数设为 `True`。
```python
def create_committee():
    """Build a fresh, unfitted heterogeneous committee.

    Returns:
        A list of ``(name, estimator)`` pairs, in the order random forest
        ("RF"), gradient boosting ("GB") and RBF-kernel SVM ("SVM"). Every
        member is seeded with ``random_state=0``.
    """
    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    boosting = GradientBoostingClassifier(n_estimators=50, random_state=0)
    # probability=True enables probability estimates on the SVM.
    svm = SVC(kernel="rbf", probability=True, random_state=0)
    return [("RF", forest), ("GB", boosting), ("SVM", svm)]
```

### 4.3 计算投票熵分歧

实现一个基于投票熵的查询函数。对未标注池中的每一个样本,委员会成员分别预测类别,然后计算投票分布的熵。

```python
def vote_entropy_query(committee, X_pool, n_instances=10):
    """Pick the pool samples on which the committee's votes disagree most.

    Every committee member predicts a class for each pool sample; the
    disagreement score of a sample is the base-2 entropy of the resulting
    vote distribution.

    Args:
        committee: Sequence of ``(name, estimator)`` pairs; each estimator
            must expose ``predict(X)``.
        X_pool: Pool of candidate samples, one row per sample.
        n_instances: Number of samples to query. Values larger than the pool
            are clamped to the pool size; values ``<= 0`` select nothing.

    Returns:
        Integer array of row indices into ``X_pool``, ordered by ascending
        vote entropy (the most disputed sample is last). Empty when the pool
        is empty or nothing was requested.
    """
    n_samples = len(X_pool)
    n_select = min(n_instances, n_samples)
    if n_select <= 0:
        # Guard explicitly: a slice of ``[-0:]`` would return the whole pool,
        # and estimators cannot predict on an empty pool.
        return np.empty(0, dtype=np.intp)

    # Stack member predictions as (n_committee, n_samples), then transpose so
    # that each row holds all votes cast for one sample.
    votes = np.array([estimator.predict(X_pool) for _, estimator in committee]).T

    n_members = len(committee)
    entropies = np.empty(n_samples)
    for i, sample_votes in enumerate(votes):
        # Vote counts per distinct label -> vote share distribution.
        _, counts = np.unique(sample_votes, return_counts=True)
        entropies[i] = entropy(counts / n_members, base=2)

    # With few members the entropy takes only a handful of values, so ties are
    # common; a stable sort keeps the selection among ties reproducible.
    order = np.argsort(entropies, kind="stable")
    return order[n_samples - n_select:]
```

### 4.4 主动学习循环

我们进行多轮迭代,每轮选择 10 个争议样本添加到训练集,观察测试集上的准确率变化(此处用固定的留出集做评估,实际项目中建议使用验证集)。

```python
# Build the experiment from three disjoint subsets. The fixed test set is
# carved out of the unlabelled pool, so a queried sample can never also be an
# evaluation sample; re-splitting the full X, y here would overlap the pool
# and leak test data into training.
X_pool, X_test, y_pool_true, y_test = train_test_split(
    X_unlabeled, y_unlabeled, test_size=0.2, random_state=7, stratify=y_unlabeled
)
# y_pool_true is only used to simulate the annotator answering a query.

# The labelled set starts from the small seed split and grows by querying.
X_train_static = X_train.copy()
y_train_static = y_train.copy()

# Initial committee, trained on the seed labels only.
committee = create_committee()
for _, est in committee:
    est.fit(X_train_static, y_train_static)

acc_history = []
n_queries = 20
batch_size = 10
for round_idx in range(n_queries):
    if len(X_pool) == 0:
        # Nothing left to query; stop instead of predicting on an empty pool.
        break

    # Query the batch of samples with the highest committee disagreement.
    query_indices = vote_entropy_query(committee, X_pool, batch_size)

    # Look up their ground truth (simulates the manual labelling step).
    X_new = X_pool[query_indices]
    y_new = y_pool_true[query_indices]

    # Add the newly labelled samples to the training set.
    X_train_static = np.vstack([X_train_static, X_new])
    y_train_static = np.concatenate([y_train_static, y_new])

    # Remove them from the pool so they cannot be queried again.
    X_pool = np.delete(X_pool, query_indices, axis=0)
    y_pool_true = np.delete(y_pool_true, query_indices, axis=0)

    # Retrain every committee member on the enlarged training set.
    for _, est in committee:
        est.fit(X_train_static, y_train_static)

    # Evaluate the ensemble by simple majority vote over member predictions.
    preds = np.array([est.predict(X_test) for _, est in committee])
    ensemble_pred = np.apply_along_axis(
        lambda x: np.bincount(x).argmax(), axis=0, arr=preds
    )
    acc = accuracy_score(y_test, ensemble_pred)
    acc_history.append(acc)
    print(f"第 {round_idx+1} 轮查询后 测试准确率: {acc:.4f}")