AutoInt：用自注意力学习特征间的高阶交互

FreeGuideOnline 最新 2026-06-24

python import torch import torch.nn as nn import torch.nn.functional as F

模拟数据：batch_size=4, 3个特征域，每个域的特征数分别为100,50,20

batch_size = 4 field_dims = [100, 50, 20] # 每个特征域的词汇量 embed_dim = 16 # 嵌入维度 num_fields = len(field_dims)

随机输入，每个特征域的值是相应词汇量下的int

x = torch.randint(0, max(field_dims), (batch_size, num_fields)) print(x.shape) # [4, 3]


### 2. 嵌入层与位置编码

```python
class FeaturesEmbedding(nn.Module):
    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in field_dims
        ])

    def forward(self, x):
        # x: [batch_size, num_fields]
        embs = [self.embedding[i](x[:, i]) for i in range(len(field_dims))]
        # 每个嵌入是 [batch_size, embed_dim]，stack成 [batch_size, num_fields, embed_dim]
        return torch.stack(embs, dim=1)

位置编码采用可学习的参数：

class PositionalEncoding(nn.Module):
    def __init__(self, num_fields, embed_dim):
        super().__init__()
        self.pos_embedding = nn.Parameter(torch.randn(num_fields, embed_dim))

    def forward(self, x):
        # x: [batch_size, num_fields, embed_dim]
        return x + self.pos_embedding.unsqueeze(0)

3. 多头自注意力交互层

省略FFN，直接使用多头注意力+残差+LayerNorm。

class MultiHeadAttentionInteraction(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: [batch_size, num_fields, d_model]
        residual = x
        batch_size, n, _ = x.size()

        Q = self.W_Q(x).view(batch_size, n, self.n_heads, self.d_k).transpose(1,2)  # [B, h, n, d_k]
        K = self.W_K(x).view(batch_size, n, self.n_heads, self.d_k).transpose(1,2)
        V = self.W_V(x).view(batch_size, n, self.n_heads, self.d_k).transpose(1,2)

        # 缩放点积注意力
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)  # [B, h, n, n]
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, V)  # [B, h, n, d_k]
        # 将多头拼起来
        attn_output = attn_output.transpose(1,2).contiguous().view(batch_size, n, self.d_model)
        attn_output = self.W_O(attn_output)

        out = self.norm(residual + attn_output)
        return out

可以堆叠多个交互层：

class InteractionStack(nn.Module):
    def __init__(self, num_layers, d_model, n_heads, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            MultiHeadAttentionInteraction(d_model, n_heads, dropout) for _ in range(num_layers)
        ])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

4. 输出层（含一阶线性部分）

一阶逻辑回归部分直接对原始稀疏特征加权，与自注意力交互结果相加。

class AutoInt(nn.Module):
    def __init__(self, field_dims, embed_dim, num_heads, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.pos_encoding = PositionalEncoding(len(field_dims), embed_dim)
        self.interaction = InteractionStack(num_layers, embed_dim, num_heads, dropout)
        # 一阶线性部分
        self.linear = nn.Embedding(sum(field_dims), 1)
        self.bias = nn.Parameter(torch.zeros(1))
        # 最终的加权组合
        self.fc = nn.Linear(embed_dim * len(field_dims), 1)

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, 0, 0.01)

    def forward(self, x):
        # x: [batch_size, num_fields]
        # 线性部分
        linear_part = self.linear(x).sum(dim=1) + self.bias  # [B, 1]

        # 自注意力交互部分
        emb = self.embedding(x)           # [B, n, d]
        emb = self.pos_encoding(emb)
        inter_out = self.interaction(emb) # [B, n, d]
        # 展平所有特征域的表示
        flatten = inter_out.view(inter_out.size(0), -1)  # [B, n*d]
        interaction_part = self.fc(flatten)              # [B, 1]

        # 融合
        logit = linear_part + interaction_part
        return torch.sigmoid(logit.squeeze(1))

测试模型：

model = AutoInt(field_dims, embed_dim=16, num_heads=4, num_layers=2)
print(model)
y = model(x)
print(y.shape)  # [4]