# Knowledge Graphs in Practice: A Hands-On Guide to Reproducing the TuckER Model in PyTorch for Link Prediction

Knowledge graphs are a key carrier of structured knowledge and play a central role in intelligent search, recommendation systems, and question answering. Real-world knowledge graphs, however, contain large numbers of missing links, and automatically completing them has become a shared focus of academia and industry. This article walks you through implementing TuckER, a knowledge graph completion method based on tensor decomposition, from scratch in PyTorch, and using it for link prediction.

## 1. Environment Setup and Data Loading

Before writing any code, we need a suitable development environment. Python 3.8 with PyTorch 1.10 is recommended; this combination offers good compatibility and performance.

```bash
conda create -n kg python=3.8
conda activate kg
pip install torch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0
pip install pandas tqdm numpy
```

We will use the FB15k-237 dataset, a standard benchmark for knowledge graph link prediction. It contains 14,541 entities and 237 relation types, and is an improved version of the original FB15k dataset that fixes its test-set leakage problem.

```python
import pandas as pd

def load_data(dataset_path):
    train = pd.read_csv(f"{dataset_path}/train.txt", sep="\t", header=None,
                        names=["head", "relation", "tail"])
    valid = pd.read_csv(f"{dataset_path}/valid.txt", sep="\t", header=None,
                        names=["head", "relation", "tail"])
    test = pd.read_csv(f"{dataset_path}/test.txt", sep="\t", header=None,
                       names=["head", "relation", "tail"])

    # Build the entity and relation vocabularies
    entities = set(train["head"]).union(set(train["tail"]))
    relations = set(train["relation"])

    return train, valid, test, entities, relations
```

Tip: FB15k-237 can be downloaded from https://github.com/TimDettmers/ConvE; unzip it into the `data` folder of your project directory.

## 2. The TuckER Architecture: Analysis and Implementation

The core idea of TuckER comes from Tucker tensor decomposition: it represents the knowledge graph's triples (head entity, relation, tail entity) as a third-order tensor and learns entity and relation embeddings by decomposing that tensor.

### 2.1 Mathematical Formulation

The TuckER scoring function is defined as

φ(eₛ, r, eₒ) = W ×₁ eₛ ×₂ r ×₃ eₒ

where:

- W ∈ ℝ^{dₑ×dᵣ×dₑ} is the core tensor
- eₛ, eₒ ∈ ℝ^{dₑ} are the head and tail entity embeddings
- r ∈ ℝ^{dᵣ} is the relation embedding
- ×ₙ denotes the n-mode tensor product

### 2.2 PyTorch Implementation

```python
import torch
import torch.nn as nn

class TuckER(nn.Module):
    def __init__(self, num_entities, num_relations, entity_dim=200, relation_dim=200):
        super(TuckER, self).__init__()
        self.entity_embedding = nn.Embedding(num_entities, entity_dim)
        self.relation_embedding = nn.Embedding(num_relations, relation_dim)

        # Core tensor W of shape [entity_dim, relation_dim, entity_dim]
        self.W = nn.Parameter(torch.randn(entity_dim, relation_dim, entity_dim))

        # Parameter initialization
        nn.init.xavier_normal_(self.entity_embedding.weight)
        nn.init.xavier_normal_(self.relation_embedding.weight)
        nn.init.xavier_normal_(self.W)

        self.bce_loss = nn.BCELoss()

    def forward(self, heads, relations, tails):
        # Look up the embeddings
        e_s = self.entity_embedding(heads)      # [batch_size, entity_dim]
        r = self.relation_embedding(relations)  # [batch_size, relation_dim]
        e_o = self.entity_embedding(tails)      # [batch_size, entity_dim]

        # n-mode products, computed via einsum
        # W ×₂ r: contract the relation mode of W with r
        W_r = torch.einsum("ijk,bj->bik", self.W, r)    # [batch_size, entity_dim, entity_dim]
        # (W ×₂ r) ×₁ e_s: contract the head-entity mode
        W_r_e_s = torch.einsum("bik,bi->bk", W_r, e_s)  # [batch_size, entity_dim]
        # ((W ×₂ r) ×₁ e_s) ×₃ e_o: contract the tail-entity mode
        score = torch.einsum("bk,bk->b", W_r_e_s, e_o)  # [batch_size]

        return torch.sigmoid(score)
```
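To sanity-check the implementation, here is a minimal smoke test (not part of the original tutorial) that instantiates the model with the FB15k-237 sizes from Section 1 and scores a small batch of random dummy triples:

```python
import torch

# Hypothetical smoke test: score a dummy batch of triples
num_entities, num_relations = 14541, 237  # FB15k-237 sizes from Section 1
model = TuckER(num_entities, num_relations, entity_dim=200, relation_dim=200)

heads = torch.randint(0, num_entities, (4,))       # 4 random head-entity ids
relations = torch.randint(0, num_relations, (4,))  # 4 random relation ids
tails = torch.randint(0, num_entities, (4,))       # 4 random tail-entity ids

scores = model(heads, relations, tails)
print(scores.shape)  # torch.Size([4]); each score lies in (0, 1) after the sigmoid
```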
## 3. Training Pipeline and Tricks

### 3.1 Negative Sampling

A knowledge graph contains only positive triples, so we must generate negatives to train the model. Common negative sampling strategies include:

- Random replacement: replace the head or the tail entity with a random one
- Frequency-based replacement: less frequent entities are sampled with higher probability
- Adversarial negative sampling: use the current model's predictions to pick hard negatives

```python
def generate_negative_samples(batch, num_entities, device="cpu"):
    heads, relations, tails = batch
    batch_size = heads.size(0)

    # Corrupt either the head or the tail of each positive triple
    neg_heads = heads.clone()
    neg_tails = tails.clone()

    # Replace the head with 50% probability, otherwise the tail
    mask = torch.rand(batch_size, device=device) < 0.5
    random_entities = torch.randint(0, num_entities, (batch_size,), device=device)
    neg_heads[mask] = random_entities[mask]
    neg_tails[~mask] = random_entities[~mask]

    return neg_heads, relations, neg_tails
```

### 3.2 The Training Loop

The loop below assumes the triples have already been mapped to integer ids and stacked into a LongTensor `train_data` of shape `[N, 3]`.

```python
def train(model, train_data, num_epochs=100, batch_size=128, learning_rate=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        # Shuffle the training data each epoch
        indices = torch.randperm(len(train_data))

        for i in range(0, len(train_data), batch_size):
            batch_indices = indices[i:i + batch_size]
            batch = train_data[batch_indices]
            heads, rels, tails = batch[:, 0], batch[:, 1], batch[:, 2]

            # Positive triples
            pos_scores = model(heads, rels, tails)
            pos_labels = torch.ones_like(pos_scores)

            # Negative triples
            neg_heads, neg_rels, neg_tails = generate_negative_samples(
                (heads, rels, tails), len(model.entity_embedding.weight))
            neg_scores = model(neg_heads, neg_rels, neg_tails)
            neg_labels = torch.zeros_like(neg_scores)

            # Combine positives and negatives
            all_scores = torch.cat([pos_scores, neg_scores])
            all_labels = torch.cat([pos_labels, neg_labels])

            # Compute the loss
            loss = model.bce_loss(all_scores, all_labels)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / (len(train_data) // batch_size)}")
```

Note: in practice it is advisable to add a learning-rate scheduler and early stopping to improve the training process.

## 4. Evaluation and Analysis

### 4.1 Metrics

The standard metrics for knowledge graph link prediction are:

- MRR (Mean Reciprocal Rank): the average of the reciprocal ranks of the correct answers
- Hits@k: the fraction of test triples whose correct answer appears among the top k predictions

```python
def evaluate(model, test_data, all_entities, device="cpu"):
    model.eval()
    ranks = []

    candidates = torch.tensor(list(all_entities), device=device)

    with torch.no_grad():
        for head, relation, tail in test_data:
            # Score every candidate tail entity for (head, relation, ?)
            head_tensor = torch.tensor([head] * len(candidates), device=device)
            rel_tensor = torch.tensor([relation] * len(candidates), device=device)

            scores = model(head_tensor, rel_tensor, candidates)
            sorted_indices = torch.argsort(scores, descending=True)

            # Rank of the correct tail entity (1-indexed)
            rank = (candidates[sorted_indices] == tail).nonzero().item() + 1
            ranks.append(rank)

    # Compute the metrics
    mrr = torch.mean(1.0 / torch.tensor(ranks, dtype=torch.float)).item()
    hits_10 = sum(r <= 10 for r in ranks) / len(ranks)
    hits_3 = sum(r <= 3 for r in ranks) / len(ranks)
    hits_1 = sum(r <= 1 for r in ranks) / len(ranks)

    return {"MRR": mrr, "Hits@1": hits_1, "Hits@3": hits_3, "Hits@10": hits_10}
```

### 4.2 Performance Tuning Tips

- Embedding dimensions: entity embeddings usually range from 100 to 500 dimensions; the relation dimension can be slightly smaller than the entity dimension
- Batch normalization: adding a batch normalization layer after the score computation stabilizes training
- Label smoothing: softening the targets keeps the model from overfitting the training data (see the sketch after the next code block)

```python
class TuckERWithBN(nn.Module):
    def __init__(self, num_entities, num_relations, entity_dim=200, relation_dim=200):
        super().__init__()
        # ... remaining initialization as above ...
        self.bn = nn.BatchNorm1d(1)

    def forward(self, heads, relations, tails):
        # ... score computation as above ...
        score = torch.einsum("bk,bk->b", W_r_e_s, e_o)
        score = self.bn(score.unsqueeze(1)).squeeze(1)
        return torch.sigmoid(score)
```
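The label smoothing mentioned in the tips above bolts easily onto the training loop from Section 3.2. The sketch below is one common binary variant, not code from the original article; the `smooth_labels` helper and the factor `epsilon` are illustrative choices:

```python
def smooth_labels(labels, epsilon=0.1):
    # Binary label smoothing (illustrative): 1 -> 1 - epsilon/2, 0 -> epsilon/2
    return labels * (1.0 - epsilon) + 0.5 * epsilon

# Usage inside the training loop, replacing the original loss line:
# loss = model.bce_loss(all_scores, smooth_labels(all_labels))
```

The softened targets keep the sigmoid outputs away from the saturated 0/1 extremes, which is what curbs overfitting.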
## 5. Advanced Topics and Extensions

### 5.1 Multi-Task Learning

TuckER's core tensor W can be viewed as knowledge shared across relations, a structure that naturally supports multi-task learning. We can add auxiliary tasks to further improve performance:

```python
class MultiTaskTuckER(TuckER):
    def __init__(self, num_entities, num_relations, entity_dim=200, relation_dim=200):
        super().__init__(num_entities, num_relations, entity_dim, relation_dim)
        self.relation_classifier = nn.Linear(relation_dim, num_relations)

    def forward(self, heads, relations, tails):
        # Primary task: link prediction
        link_scores = super().forward(heads, relations, tails)

        # Auxiliary task: relation classification
        r_emb = self.relation_embedding(relations)
        relation_logits = self.relation_classifier(r_emb)

        return link_scores, relation_logits
```

### 5.2 Model Compression and Deployment

In production, the model may have to run in resource-constrained environments. Model size can be reduced with the following techniques:

- Tensor decomposition: apply a low-rank factorization to the core tensor W
- Quantization: convert model parameters from FP32 to INT8
- Knowledge distillation: train a small student model to mimic a large teacher model

```python
class CompressedTuckER(nn.Module):
    def __init__(self, num_entities, num_relations, entity_dim=200, relation_dim=200, rank=50):
        super().__init__()
        self.entity_embedding = nn.Embedding(num_entities, entity_dim)
        self.relation_embedding = nn.Embedding(num_relations, relation_dim)

        # Low-rank factors of the core tensor
        self.U = nn.Parameter(torch.randn(entity_dim, rank))
        self.V = nn.Parameter(torch.randn(relation_dim, rank))
        self.W = nn.Parameter(torch.randn(entity_dim, rank))

    def forward(self, heads, relations, tails):
        e_s = self.entity_embedding(heads)
        r = self.relation_embedding(relations)
        e_o = self.entity_embedding(tails)

        # Reconstruct an approximation of the core tensor from the factors
        core = torch.einsum("ik,jk,lk->ijl", self.U, self.V, self.W)

        # Compute the score
        score = torch.einsum("ijk,bi,bj,bk->b", core, e_s, r, e_o)
        return torch.sigmoid(score)
```

In real projects, I have found that balancing the embedding dimension against the rank of the core tensor has a large impact on model performance. A practical approach is to train the full model first, then analyze the singular values of the core tensor to decide what rank to compress to.
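As a concrete illustration of the singular-value analysis just described, here is a small sketch (my own assumption, not code from the article): it unfolds the trained core tensor along its first mode and picks the smallest rank that retains a given fraction of the spectrum's energy. The `suggest_rank` helper and the `energy` threshold are hypothetical choices.

```python
import torch

def suggest_rank(core, energy=0.95):
    # Mode-1 unfolding of the core tensor: [entity_dim, relation_dim * entity_dim]
    unfolded = core.reshape(core.size(0), -1)
    s = torch.linalg.svdvals(unfolded)  # singular values, in descending order
    cumulative = torch.cumsum(s ** 2, dim=0) / (s ** 2).sum()
    # Smallest rank whose leading singular values keep `energy` of the spectrum
    return int((cumulative < energy).sum().item()) + 1

# e.g. after training the full TuckER model:
# rank = suggest_rank(model.W.detach())
```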