ComplEx is an improvement on DistMult.
Premise: start from the bilinear (DistMult-style) model, whose scoring function is f(h, r, t) = hᵀ · diag(r) · t.
Basic assumption: represent the embeddings (entities and relations) as complex vectors, each component of the form u = a + bi.
Scoring function: ComplEx therefore scores a triple as f(h, r, t) = Re(hᵀ · diag(r) · t̄), where t̄ is the complex conjugate of t and Re(·) takes the real part.
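As a quick check of this formula, here is a minimal sketch (my own illustration, not the repository's code; it assumes a PyTorch version with complex-tensor support):

import torch

def complex_score(h, r, t):
    # ComplEx score Re(h^T diag(r) conj(t)) for complex embedding vectors.
    return torch.sum(h * r * torch.conj(t)).real

# Toy 2-dimensional complex embeddings.
h = torch.tensor([1.0 + 2.0j, 0.5 - 1.0j])
r = torch.tensor([0.3 + 0.7j, -0.2 + 0.1j])
t = torch.tensor([0.9 - 0.4j, 1.1 + 0.6j])
print(complex_score(h, r, t))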
Antisymmetry: train hᵀ·diag(r)·t̄ to score high while h̄ᵀ·diag(r)·t (the score of the reversed triple (t, r, h)) scores low; the two differ whenever r has a nonzero imaginary part.
Symmetry: obtained by setting the imaginary part of r to 0, which makes the score invariant under swapping h and t.
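Continuing the sketch above (again my own illustration), both properties are easy to check numerically: a relation with a nonzero imaginary part scores (h, r, t) and (t, r, h) differently, while a purely real relation scores them identically.

r_asym = torch.tensor([0.3 + 0.7j, -0.2 + 0.1j])  # Im(r) != 0
r_sym = torch.tensor([0.3 + 0.0j, -0.2 + 0.0j])   # Im(r) == 0
print(complex_score(h, r_asym, t), complex_score(t, r_asym, h))  # two different scores
print(complex_score(h, r_sym, t), complex_score(t, r_sym, h))    # the same score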
Transitivity: not satisfied; this can be shown with a simple argument about dot products.
Model transitivity as the question: given a·b and b·c, can a·c be deduced? Write a = (x1, y1), b = (x2, y2), c = (x3, y3), and pick values arbitrarily, say:
x1x2 + y1y2 = 1
x2x3 + y2y3 = 2
These two equations do not determine x1x3 + y1y3. For example, with b = (1, 0), both a = (1, 0), c = (2, 0) and a = (1, 1), c = (2, 1) satisfy them, yet give a·c = 2 and a·c = 3 respectively.
One-to-many relations: it suffices that t1 and t2 have projections of the same magnitude onto h, so multiple tails can receive the same score for one (h, r) pair.
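For reference, the triples below are sample lines from the dataset's train.txt (one tab-separated head, relation, tail per line), which is the format load_data.py parses: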
The Prowler starred_actors Evelyn Keyes
Robinson Crusoe in_language English
Memory starred_actors Billy Zane
The Wrath of God starred_actors Robert Mitchum
The Departed has_tags police
Kismet in_language English
Hoodwinked! has_tags children
Village of the Damned has_tags remake
Reel Injun written_by Neil Diamond
The Parent Trap starred_actors Dennis Quaid
Tarzan has_tags disney animated feature
Yellow Sky has_genre Western
The Happening release_year 2008
April Fool's Day has_genre Horror
Vantage Point has_tags assassination
A Patch of Blue starred_actors Elizabeth Hartman
Brian's Song starred_actors Jack Warden
...
model.py
import numpy as np
import torch
from torch.nn.init import xavier_normal_
import torch.nn as nn
import torch.nn.functional as F


# Embeds the entities and relations of the knowledge graph.
class EmbedModel(torch.nn.Module):
    def __init__(self, d, ent_vec_dim, rel_vec_dim, **kwargs):
        super(EmbedModel, self).__init__()
        self.model_name = kwargs["model_name"]
        multiplier = 2  # ComplEx stores a real and an imaginary part, so dimensions double
        self.loss_type = kwargs['loss_type']
        if self.loss_type == 'BCE':
            self.loss = self.bce_loss
            self.bce_loss_loss = torch.nn.BCELoss()
        elif self.loss_type == 'CE':
            self.loss = self.ce_loss
        else:
            print('Incorrect loss specified:', self.loss_type)
            exit(0)
        self.model = self.ComplEx
        self.E_Embedding = torch.nn.Embedding(len(d.entities), ent_vec_dim * multiplier, padding_idx=0)
        # Note: relation embeddings also use ent_vec_dim; entity and relation dims must match for ComplEx.
        self.R_Embedding = torch.nn.Embedding(len(d.relations), ent_vec_dim * multiplier, padding_idx=0)
        self.entity_dim = ent_vec_dim * multiplier
        self.do_batch_norm = True
        if kwargs["do_batch_norm"] == False:
            self.do_batch_norm = False
        self.input_dropout = torch.nn.Dropout(kwargs["input_dropout"])
        self.hidden_dropout1 = torch.nn.Dropout(kwargs["hidden_dropout1"])
        self.hidden_dropout2 = torch.nn.Dropout(kwargs["hidden_dropout2"])
        self.l3_reg = kwargs["l3_reg"]
        # BatchNorm1d: with input of shape (N, C, L), num_features is C
        # (N = batch size, C = channels, L = length per channel);
        # with input of shape (N, L), num_features is L (a single implicit channel).
        self.bn0 = torch.nn.BatchNorm1d(num_features=multiplier)
        self.bn1 = torch.nn.BatchNorm1d(num_features=multiplier)  # defined but unused in the ComplEx path
        self.bn2 = torch.nn.BatchNorm1d(num_features=multiplier)
        self.logsoftmax = torch.nn.LogSoftmax(dim=-1)

    def init(self):
        xavier_normal_(self.E_Embedding.weight.data)
        xavier_normal_(self.R_Embedding.weight.data)

    def freeze_entity_embeddings(self):
        self.E_Embedding.weight.requires_grad = False

    def ce_loss(self, pred, true):
        pred = F.log_softmax(pred, dim=-1)
        true = true / true.size(-1)
        loss = -torch.sum(pred * true)
        return loss

    def bce_loss(self, pred, true):
        loss = self.bce_loss_loss(pred, true)
        # l3 regularization on the entity embeddings
        if self.l3_reg:
            norm = torch.norm(self.E_Embedding.weight.data, p=3, dim=-1)
            loss += self.l3_reg * torch.sum(norm)
        return loss

    # head.shape = (batch_size, 400); relation.shape = (batch_size, 400)
    def ComplEx(self, head, relation):
        # Split each 400-d embedding into its real half and imaginary half.
        heads_tuple = torch.chunk(head, 2, dim=1)  # two tensors of shape (batch_size, 200)
        head = torch.stack(list(heads_tuple), dim=1)  # (batch_size, 400) -> (batch_size, 2, 200)
        if self.do_batch_norm:
            head = self.bn0(head)
        head = self.input_dropout(head)
        head = head.permute(1, 0, 2)  # (batch_size, 2, 200) -> (2, batch_size, 200)
        re_head = head[0]  # (batch_size, 200)
        im_head = head[1]  # (batch_size, 200)
        relation = self.hidden_dropout1(relation)  # (batch_size, 400)
        re_relation, im_relation = torch.chunk(relation, 2, dim=1)  # each (batch_size, 200)
        # Candidate tails: every entity embedding, split the same way.
        re_tail, im_tail = torch.chunk(self.E_Embedding.weight, 2, dim=1)  # each (num_entities, 200)
        # Complex multiplication of head and relation.
        re_score = re_head * re_relation - im_head * im_relation  # (batch_size, 200)
        im_score = re_head * im_relation + im_head * re_relation  # (batch_size, 200)
        score = torch.stack([re_score, im_score], dim=1)  # (batch_size, 2, 200)
        if self.do_batch_norm:
            score = self.bn2(score)
        score = self.hidden_dropout2(score)
        score = score.permute(1, 0, 2)  # (2, batch_size, 200)
        re_score = score[0]
        im_score = score[1]
        # Dot product against all candidate tails: Re(<h, r, conj(t)>).
        score = torch.mm(re_score, re_tail.transpose(1, 0)) + torch.mm(im_score, im_tail.transpose(1, 0))  # (batch_size, num_entities)
        return score

    # e1_idx: head-entity ids for one batch, e.g. tensor([12711, 1016, 11215, ...])
    # r_idx: relation ids for one batch, e.g. tensor([382, 384, 372, ...])
    def forward(self, e1_idx, r_idx):
        e1 = self.E_Embedding(e1_idx)  # (batch_size, 400)
        r = self.R_Embedding(r_idx)  # (batch_size, 400)
        ans = self.model(head=e1, relation=r)
        pred = torch.sigmoid(ans)
        return pred
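A minimal smoke test of this module (my own sketch: StubData and the hyperparameters are placeholders standing in for load_data.Data and the real config):

import torch
from model import EmbedModel

class StubData:  # stands in for load_data.Data
    entities = ["e%d" % i for i in range(100)]
    relations = ["r%d" % i for i in range(10)]

kwargs = {"model_name": "ComplEx", "loss_type": "BCE", "do_batch_norm": True,
          "input_dropout": 0.3, "hidden_dropout1": 0.4, "hidden_dropout2": 0.5, "l3_reg": 0.0}
model = EmbedModel(StubData(), ent_vec_dim=200, rel_vec_dim=200, **kwargs)
model.init()
model.eval()  # eval mode so BatchNorm uses running statistics
with torch.no_grad():
    pred = model(torch.tensor([1, 2]), torch.tensor([0, 3]))
print(pred.shape)  # torch.Size([2, 100]): one sigmoid score per candidate tail entity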
load_data.py
class Data:
    def __init__(self, data_dir=None, reverse=False):
        self.train_data = self.load_data(data_dir, "train", reverse=reverse)  # all (head, relation, tail) triples in train
        self.valid_data = self.load_data(data_dir, "valid", reverse=reverse)  # all triples in valid
        self.test_data = self.load_data(data_dir, "test", reverse=reverse)  # all triples in test
        self.data = self.train_data + self.valid_data + self.test_data  # every triple in the dataset
        self.entities = self.get_entities(self.data)  # every entity in the dataset
        print("load_data.py----> total number of entities: len(self.entities) = ", len(self.entities))
        self.train_relations = self.get_relations(self.train_data)  # relations appearing in train
        self.valid_relations = self.get_relations(self.valid_data)  # relations appearing in valid
        self.test_relations = self.get_relations(self.test_data)  # relations appearing in test
        # All relations: train order first, then unseen valid/test relations appended.
        self.relations = self.train_relations \
            + [i for i in self.valid_relations if i not in self.train_relations] \
            + [i for i in self.test_relations if i not in self.train_relations]
        print("load_data.py----> total number of relations: len(self.relations) = ", len(self.relations))

    def load_data(self, data_dir, data_type="train", reverse=False):
        file_path = "%s%s.txt" % (data_dir, data_type)  # e.g. data/FB15k-237/train.txt
        print("data_dir = {0}; data_type = {1}; file_path = {2}".format(data_dir, data_type, file_path))
        with open(file_path, "r") as f:
            data = f.read().strip().split("\n")
            data = [i.split('\t') for i in data]
            # Add the reverse of every triple, doubling the training data.
            if reverse:
                data += [[i[2], i[1] + "_reverse", i[0]] for i in data]
        return data

    # All entities appearing in the triples of data.
    def get_entities(self, data):
        entities = sorted(list(set([d[0] for d in data] + [d[2] for d in data])))
        return entities

    # All relations appearing in the triples of data.
    def get_relations(self, data):
        relations = sorted(list(set([d[1] for d in data])))
        return relations
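As a standalone illustration of the reverse-triple augmentation used above (my own sketch, reusing a sample triple from earlier in this post):

triples = [["The Departed", "has_tags", "police"]]
triples += [[t[2], t[1] + "_reverse", t[0]] for t in triples]
print(triples)
# [['The Departed', 'has_tags', 'police'], ['police', 'has_tags_reverse', 'The Departed']]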
trainer.py
import numpy as np
import torch
import time
from collections import defaultdict
from model import *
from torch.optim.lr_scheduler import ExponentialLR
from tqdm import tqdm
import os


class Trainer:
    def __init__(self, d=None, learning_rate=0.0005, ent_vec_dim=200, rel_vec_dim=200, num_iterations=500,
                 batch_size=128, decay_rate=0., cuda=False, input_dropout=0.3, hidden_dropout1=0.4,
                 hidden_dropout2=0.5, label_smoothing=0., outfile='tucker.model', valid_steps=1,
                 loss_type='BCE', do_batch_norm=1, dataset_name='', model_name='ComplEx', l3_reg=0.0, load_from=''):
        self.d = d  # the full dataset (train, valid and test splits)
        self.dataset_name = dataset_name
        self.learning_rate = learning_rate
        self.ent_vec_dim = ent_vec_dim  # entity embedding dimension
        self.rel_vec_dim = rel_vec_dim  # relation embedding dimension
        self.num_epochs = num_iterations
        self.batch_size = batch_size
        self.decay_rate = decay_rate
        self.label_smoothing = label_smoothing
        self.cuda = cuda
        self.outfile = outfile
        self.valid_steps = valid_steps
        self.model_name = model_name
        self.l3_reg = l3_reg
        self.loss_type = loss_type
        self.load_from = load_from
        if do_batch_norm == 1:
            do_batch_norm = True
        else:
            do_batch_norm = False
        self.kwargs = {"input_dropout": input_dropout, "hidden_dropout1": hidden_dropout1,
                       "hidden_dropout2": hidden_dropout2, "model_name": model_name,
                       "loss_type": loss_type, "do_batch_norm": do_batch_norm, "l3_reg": l3_reg}

    # Convert triples to id form: ['/m/027rn', '/location/country/...', '/m/06cx9'] -> (3818, 244, 8942)
    def get_data_idxs(self, data):
        data_idxs = [(self.entity2idxs[data[i][0]], self.relation2idxs[data[i][1]], self.entity2idxs[data[i][2]])
                     for i in range(len(data))]
        return data_idxs

    # Map every (head, relation) pair to all of its tails: (head, relation): [tail01, tail02, ...]
    def get_er_vocab(self, data):
        er_vocab = defaultdict(list)
        for triple in data:
            er_vocab[(triple[0], triple[1])].append(triple[2])
        return er_vocab

    # Build one batch of training data.
    def get_batch(self, er_vocab, er_vocab_pairs, batch_idx):
        batch = er_vocab_pairs[batch_idx:batch_idx + self.batch_size]
        batch_size = len(batch)
        num_entities = len(self.d.entities)
        # Multi-hot targets: one row per (head, relation) pair, one column per entity,
        # shape (batch_size, num_entities), e.g. (128, 14541).
        targets = torch.zeros([batch_size, num_entities], dtype=torch.float32)
        if self.cuda:
            targets = targets.cuda()
        for batch_idx, pair in enumerate(batch):
            target_entities_idx = er_vocab[pair]
            targets[batch_idx, target_entities_idx] = 1.
        return np.array(batch), targets

    def train_and_eval(self):
        torch.set_num_threads(2)
        best_valid = [0, 0, 0, 0, 0]
        best_test = [0, 0, 0, 0, 0]
        num_entities = len(self.d.entities)
        num_relations = len(self.d.relations)
        self.entity2idxs = {self.d.entities[i]: i for i in range(num_entities)}  # entity -> id
        self.relation2idxs = {self.d.relations[i]: i for i in range(num_relations)}  # relation -> id
        # Save the entity-to-id mapping.
        with open('data/' + self.dataset_name + '/entities.dict', 'w') as f:
            for key, value in self.entity2idxs.items():
                f.write(key + '\t' + str(value) + '\n')
        # Save the relation-to-id mapping.
        with open('data/' + self.dataset_name + '/relations.dict', 'w') as f:
            for key, value in self.relation2idxs.items():
                f.write(key + '\t' + str(value) + '\n')
        # Convert triples to id form; with reverse=True the training triples are already doubled
        # (e.g. 544230 for FB15k-237: the 272115 triples of train.txt plus their reverses).
        train_data_idxs = self.get_data_idxs(self.d.train_data)
        print("Number of training data points: %d" % len(train_data_idxs))
        print('Entities: %d' % len(self.entity2idxs))  # e.g. 14541
        print('Relations: %d' % len(self.relation2idxs))  # e.g. 474
        # Initialize the model (d: the data loaded via load_data.py).
        model = EmbedModel(self.d, self.ent_vec_dim, self.rel_vec_dim, **self.kwargs)
        model.init()
        # Optionally load saved model parameters.
        if self.load_from != '':
            fname = self.load_from
            checkpoint = torch.load(fname)
            model.load_state_dict(checkpoint)
        if self.cuda:
            model.cuda()
        opt = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        if self.decay_rate:
            scheduler = ExponentialLR(opt, self.decay_rate)
        er_vocab = self.get_er_vocab(train_data_idxs)  # (head, relation): [tail01, tail02, ...]
        er_vocab_pairs = list(er_vocab.keys())  # all (head, relation) pairs
        print("Starting training...")
        for epoch_idx in range(1, self.num_epochs + 1):
            start_train = time.time()
            model.train()
            losses = []
            np.random.shuffle(er_vocab_pairs)
            for j in tqdm(range(0, len(er_vocab_pairs), self.batch_size)):
                # Fetch one batch of training data.
                X, labels = self.get_batch(er_vocab, er_vocab_pairs, j)
                opt.zero_grad()
                e1_idx, r_idx = torch.tensor(X[:, 0]), torch.tensor(X[:, 1])  # head-entity ids, relation ids
                if self.cuda:
                    e1_idx = e1_idx.cuda()
                    r_idx = r_idx.cuda()
                # Predict tails from (head, relation).
                predictions = model.forward(e1_idx, r_idx)
                if self.label_smoothing:
                    labels = ((1.0 - self.label_smoothing) * labels) + (1.0 / labels.size(1))
                loss = model.loss(predictions, labels)
                loss.backward()
                opt.step()
                losses.append(loss.item())
            # Adjust the learning rate once per epoch.
            if self.decay_rate:
                scheduler.step()
            # Print progress every 100 epochs.
            if epoch_idx % 100 == 0:
                print('Epoch', epoch_idx, ' Epoch time', time.time() - start_train, ' Loss:', np.mean(losses))
            # Validate and test.
            model.eval()
            with torch.no_grad():
                if epoch_idx % self.valid_steps == 0:
                    start_test = time.time()
                    print("\n\nValidating:")
                    valid = self.evaluate(model, self.d.valid_data)
                    print("\n\nTesting:")
                    test = self.evaluate(model, self.d.test_data)
                    valid_mrr = valid[0]
                    test_mrr = test[0]
                    if valid_mrr >= best_valid[0]:
                        best_valid = valid
                        best_test = test
                        print('Validation MRR increased.')
                        print('Saving model...')
                        self.write_embedding_files(model)
                        print('Model saved!')
                    print('Best valid:', best_valid)
                    print('Best Test:', best_test)
                    print('Dataset:', self.dataset_name)
                    print('Model:', self.model_name)
                    print(time.time() - start_test)
                    print('Learning rate %f | Decay %f | Dim %d | Input drop %f | Hidden drop 2 %f | LS %f | Batch size %d | Loss type %s | L3 reg %f' % (
                        self.learning_rate, self.decay_rate, self.ent_vec_dim, self.kwargs["input_dropout"],
                        self.kwargs["hidden_dropout2"], self.label_smoothing, self.batch_size,
                        self.loss_type, self.l3_reg))

    # Evaluate on the valid or test split.
    def evaluate(self, model, data):
        model.eval()
        hits = [[] for _ in range(10)]
        ranks = []
        # Convert all valid/test triples to id form.
        test_data_idxs = self.get_data_idxs(data)
        # (head, relation): [tail01, tail02, ...], e.g. (5304, 8): [7793, 8554, 1084]
        er_vocab = self.get_er_vocab(test_data_idxs)
        print("Number of data points: %d" % len(test_data_idxs))  # e.g. 35070
        len_test_data_idxs = len(test_data_idxs)
        for i in tqdm(range(0, len_test_data_idxs, self.batch_size)):
            data_batch = np.array(test_data_idxs[i: i + self.batch_size])  # shape (batch_size, 3)
            e1_idx = torch.tensor(data_batch[:, 0])  # head entities
            r_idx = torch.tensor(data_batch[:, 1])  # relations
            e2_idx = torch.tensor(data_batch[:, 2])  # tail entities
            if self.cuda:
                e1_idx = e1_idx.cuda()
                r_idx = r_idx.cuda()
                e2_idx = e2_idx.cuda()
            # Predict tails from (head, relation); predictions.shape = (batch_size, num_entities)
            predictions = model.forward(e1_idx, r_idx)
            # Commenting out the filtering below gives RAW (unfiltered) evaluation.
            batch_size = data_batch.shape[0]
            for i in range(batch_size):
                e1_r_idx = (data_batch[i][0], data_batch[i][1])
                filt = er_vocab[e1_r_idx]  # indices of all correct tails for this (head, relation)
                e2_idx_i = e2_idx[i]  # index of this triple's true tail
                target_value = predictions[i, e2_idx_i].item()  # predicted score of the true tail
                # Zero all known-correct tails, then restore the current triple's own tail score.
                predictions[i, filt] = 0.0
                predictions[i, e2_idx_i] = target_value
            # Sort candidates by score to find the rank of the true tail among all candidates.
            sort_values, sort_idxs = torch.sort(predictions, dim=1, descending=True)
            sort_idxs = sort_idxs.cpu().numpy()
            for i in range(batch_size):
                sort_idxs_i = sort_idxs[i]  # candidate indices sorted by descending score
                e2_idx_i = e2_idx[i].item()  # true tail index
                # np.where with a single argument returns the coordinates of matching elements as a tuple.
                filt_tuple = np.where(sort_idxs_i == e2_idx_i)
                rank = filt_tuple[0][0]
                ranks.append(rank + 1)
                # Accumulate hits@k statistics.
                for hits_level in range(10):
                    if rank <= hits_level:
                        hits[hits_level].append(1.0)
                    else:
                        hits[hits_level].append(0.0)
        # Compute hits@1, hits@3, hits@10, mean rank and MRR.
        hitat10 = np.mean(hits[9])
        hitat3 = np.mean(hits[2])
        hitat1 = np.mean(hits[0])
        meanrank = np.mean(ranks)
        mrr = np.mean(1. / np.array(ranks))
        print('Hits @10: {0}'.format(hitat10))
        print('Hits @3: {0}'.format(hitat3))
        print('Hits @1: {0}'.format(hitat1))
        print('Mean rank: {0}'.format(meanrank))
        print('Mean reciprocal rank: {0}'.format(mrr))
        return [mrr, meanrank, hitat10, hitat3, hitat1]

    # Save the entity and relation embeddings (plus BatchNorm parameters and id dicts).
    def write_embedding_files(self, model):
        model.eval()
        model_folder = "kg_embeddings/%s/" % self.dataset_name  # e.g. 'kg_embeddings/FB15k-237/'
        data_folder = "data/%s/" % self.dataset_name  # e.g. 'data/FB15k-237/'
        embedding_type = self.model_name
        if os.path.exists(model_folder) == False:
            print("Creating directory: ", model_folder)
            os.makedirs(model_folder)
        E_numpy = model.E_Embedding.weight.data.cpu().numpy()
        R_numpy = model.R_Embedding.weight.data.cpu().numpy()
        # Collect the BatchNorm parameters.
        bn_list = []
        for bn in [model.bn0, model.bn1, model.bn2]:
            bn_numpy = {}
            bn_numpy['weight'] = bn.weight.data.cpu().numpy()
            bn_numpy['bias'] = bn.bias.data.cpu().numpy()
            bn_numpy['running_mean'] = bn.running_mean.data.cpu().numpy()
            bn_numpy['running_var'] = bn.running_var.data.cpu().numpy()
            bn_list.append(bn_numpy)
        np.save(model_folder + '/E.npy', E_numpy)  # entity embeddings
        np.save(model_folder + '/R.npy', R_numpy)  # relation embeddings
        # Save the BatchNorm parameters.
        for i, bn in enumerate(bn_list):
            np.save(model_folder + '/bn' + str(i) + '.npy', bn)
        if embedding_type == 'TuckER':
            W_numpy = model.W.detach().cpu().numpy()
            np.save(model_folder + '/W.npy', W_numpy)  # core tensor weights
        # Copy entities.dict from the data folder to the model folder (written as id\tname).
        f1 = open(data_folder + '/entities.dict', 'r')
        f2 = open(model_folder + '/entities.dict', 'w')
        ents = {}
        idx2ent = {}
        for line in f1:
            line = line.rstrip().split('\t')
            name = line[0]
            id = int(line[1])
            ents[name] = id
            idx2ent[id] = name
            f2.write(str(id) + '\t' + name + '\n')
        f1.close()
        f2.close()
        # Copy relations.dict likewise.
        f1 = open(data_folder + '/relations.dict', 'r')
        f2 = open(model_folder + '/relations.dict', 'w')
        rels = {}
        idx2rel = {}
        for line in f1:
            line = line.strip().split('\t')
            name = line[0]
            id = int(line[1])
            rels[name] = id
            idx2rel[id] = name
            f2.write(str(id) + '\t' + name + '\n')
        f1.close()
        f2.close()
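The filtered-ranking logic in evaluate is easier to see in isolation. Here is a stripped-down sketch of the same idea for a single test triple (my own illustration, with made-up scores):

import torch

predictions = torch.tensor([0.9, 0.1, 0.8, 0.7, 0.3])  # scores over 5 candidate tails
true_tail = 3
known_tails = [0, 3]  # all correct tails for this (head, relation) in the data

# Filtered setting: zero out the other known-correct tails so they cannot
# outrank the tail being tested, then restore the test tail's own score.
target = predictions[true_tail].item()
predictions[known_tails] = 0.0
predictions[true_tail] = target

sort_idxs = torch.sort(predictions, descending=True).indices
rank = (sort_idxs == true_tail).nonzero().item() + 1  # 1-based rank
print(rank)  # 2: only candidate 2 (score 0.8) still outranks the true tail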
utils.py
import os
import torch
import numpy as np
import random


def seed_everything(seed=1029):
    '''
    Seed every RNG in the development environment for reproducibility.
    :param seed:
    :return:
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():  # note: call the function, not the bare attribute
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Some cuDNN methods remain nondeterministic after seeding unless told to be deterministic.
        torch.backends.cudnn.deterministic = True
main.py
import os
from load_data import Data
import torch
from model import *
from trainer import Trainer
import argparse
from utils import seed_everything

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_name", type=str, default="MetaQA", nargs="?",
                        help="Which dataset to use: FB15k, FB15k-237, MetaQA, WN18 or WN18RR.")
    parser.add_argument("--num_iterations", type=int, default=500, nargs="?", help="Number of iterations.")
    parser.add_argument("--batch_size", type=int, default=5120, nargs="?", help="Batch size.")  # 128
    parser.add_argument("--lr", type=float, default=0.0005, nargs="?", help="Learning rate.")
    parser.add_argument("--model", type=str, default='ComplEx', nargs="?", help="Model.")  # Rotat3
    parser.add_argument("--dr", type=float, default=1.0, nargs="?", help="Decay rate.")
    parser.add_argument("--edim", type=int, default=200, nargs="?", help="Entity embedding dimensionality.")
    parser.add_argument("--rdim", type=int, default=200, nargs="?", help="Relation embedding dimensionality.")
    parser.add_argument("--cuda", type=bool, default=True, nargs="?", help="Whether to use cuda (GPU) or not (CPU).")
    parser.add_argument("--input_dropout", type=float, default=0.3, nargs="?", help="Input layer dropout.")
    parser.add_argument("--hidden_dropout1", type=float, default=0.4, nargs="?", help="Dropout after the first hidden layer.")
    parser.add_argument("--hidden_dropout2", type=float, default=0.5, nargs="?", help="Dropout after the second hidden layer.")
    parser.add_argument("--label_smoothing", type=float, default=0.1, nargs="?", help="Amount of label smoothing.")
    parser.add_argument("--outfile", type=str, default='tucker.model', nargs="?", help="File to save")
    parser.add_argument("--valid_steps", type=int, default=1, nargs="?", help="Epochs before u validate")
    parser.add_argument("--loss_type", type=str, default='BCE', nargs="?", help="Loss type")
    parser.add_argument("--do_batch_norm", type=int, default=1, nargs="?", help="Do batch norm or not (0, 1)")
    parser.add_argument("--l3_reg", type=float, default=0.0, nargs="?", help="l3 reg hyperparameter")
    parser.add_argument("--load_from", type=str, default='', nargs="?", help="load from state dict")
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    dataset_name = args.dataset_name
    data_dir = "data/%s/" % dataset_name  # e.g. data/MetaQA/
    # Set one global random seed.
    seed_everything()
    # Load and build the dataset.
    d = Data(data_dir=data_dir, reverse=True)
    trainer = Trainer(d=d, num_iterations=args.num_iterations, batch_size=args.batch_size,
                      learning_rate=args.lr, decay_rate=args.dr, ent_vec_dim=args.edim,
                      rel_vec_dim=args.rdim, cuda=args.cuda, input_dropout=args.input_dropout,
                      hidden_dropout1=args.hidden_dropout1, hidden_dropout2=args.hidden_dropout2,
                      label_smoothing=args.label_smoothing, outfile=args.outfile,
                      valid_steps=args.valid_steps, loss_type=args.loss_type,
                      do_batch_norm=args.do_batch_norm, dataset_name=args.dataset_name,
                      model_name=args.model, l3_reg=args.l3_reg, load_from=args.load_from)
    trainer.train_and_eval()
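With the files above in place, training is launched from the command line, for example:

python main.py --dataset_name MetaQA --model ComplEx --num_iterations 500 --batch_size 5120

Note that main.py pins CUDA_VISIBLE_DEVICES to GPU 3; adjust that line for your machine.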