The code can be forked on GitHub. This post mainly adds explanatory comments, and is meant to be read together with my own notes ("Understanding the HAN code with annotations").
main.py
import torch
from sklearn.metrics import f1_score

from utils import load_data, EarlyStopping


def score(logits, labels):
    # `_` is the conventional name for a value we discard; only the argmax
    # indices (the predicted classes) are used here.
    _, indices = torch.max(logits, dim=1)
    prediction = indices.long().cpu().numpy()
    labels = labels.cpu().numpy()
    accuracy = (prediction == labels).sum() / len(prediction)
    micro_f1 = f1_score(labels, prediction, average='micro')
    macro_f1 = f1_score(labels, prediction, average='macro')

    return accuracy, micro_f1, macro_f1


def evaluate(model, g, features, labels, mask, loss_func):
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
    loss = loss_func(logits[mask], labels[mask])
    accuracy, micro_f1, macro_f1 = score(logits[mask], labels[mask])

    return loss, accuracy, micro_f1, macro_f1


def main(args):
    # If args['hetero'] is True, g is a heterogeneous graph.
    # Otherwise it is a list of homogeneous graphs, one per metapath.
    # (The trailing backslash is Python's line-continuation character.)
    g, features, labels, num_classes, train_idx, val_idx, test_idx, \
    train_mask, val_mask, test_mask = load_data(args['dataset'])

    if hasattr(torch, 'BoolTensor'):
        # hasattr() checks whether the torch module exposes a BoolTensor
        # attribute (added in newer PyTorch versions); if so, the masks
        # should be cast to boolean tensors.
        train_mask = train_mask.bool()
        val_mask = val_mask.bool()
        test_mask = test_mask.bool()

    # .to() moves each tensor to args['device'] (CPU, or CUDA if available;
    # set in utils.setup()).
    features = features.to(args['device'])
    labels = labels.to(args['device'])
    train_mask = train_mask.to(args['device'])
    val_mask = val_mask.to(args['device'])
    test_mask = test_mask.to(args['device'])

    if args['hetero']:
        # Heterogeneous-graph variant: this HAN comes from model_hetero.
        # Given an existing heterogeneous graph, it extracts the listed
        # metapaths and applies node-level and semantic-level attention.
        from model_hetero import HAN
        model = HAN(meta_paths=[['pa', 'ap'], ['pf', 'fp']],  # edge types pa/ap compose into metapath PAP, pf/fp into PFP
                    in_size=features.shape[1],
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],
                    dropout=args['dropout']).to(args['device'])
        g = g.to(args['device'])
    else:
        # Preprocessed variant: each metapath already has its own
        # homogeneous adjacency graph.
        from model import HAN
        model = HAN(num_meta_paths=len(g),        # number of metapath-based adjacency graphs
                    in_size=features.shape[1],    # input feature dimension (1870 for ACM)
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],  # number of attention heads per layer
                    dropout=args['dropout']).to(args['device'])
        g = [graph.to(args['device']) for graph in g]

    stopper = EarlyStopping(patience=args['patience'])
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'],
                                 weight_decay=args['weight_decay'])

    for epoch in range(args['num_epochs']):
        model.train()
        logits = model(g, features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc, train_micro_f1, train_macro_f1 = score(logits[train_mask], labels[train_mask])
        val_loss, val_acc, val_micro_f1, val_macro_f1 = evaluate(model, g, features, labels, val_mask, loss_fcn)
        early_stop = stopper.step(val_loss.data.item(), val_acc, model)

        print('Epoch {:d} | Train Loss {:.4f} | Train Micro f1 {:.4f} | Train Macro f1 {:.4f} | '
              'Val Loss {:.4f} | Val Micro f1 {:.4f} | Val Macro f1 {:.4f}'.format(
            epoch + 1, loss.item(), train_micro_f1, train_macro_f1, val_loss.item(), val_micro_f1, val_macro_f1))

        if early_stop:
            break

    stopper.load_checkpoint(model)
    test_loss, test_acc, test_micro_f1, test_macro_f1 = evaluate(model, g, features, labels, test_mask, loss_fcn)
    print('Test loss {:.4f} | Test Micro f1 {:.4f} | Test Macro f1 {:.4f}'.format(
        test_loss.item(), test_micro_f1, test_macro_f1))


if __name__ == '__main__':
    import argparse

    from utils import setup

    parser = argparse.ArgumentParser('HAN')
    parser.add_argument('-s', '--seed', type=int, default=1,
                        help='Random seed')
    parser.add_argument('-ld', '--log-dir', type=str, default='results',
                        help='Dir for saving training results')
    parser.add_argument('--hetero', action='store_true',
                        help="Use metapath coalescing with DGL's own dataset")
    args = parser.parse_args().__dict__

    args = setup(args)

    main(args)
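As a sanity check on what score() returns, here is a minimal sketch on hand-made logits. The tensors are illustrative values I chose, not anything from the dataset:

import torch
from sklearn.metrics import f1_score

# Dummy logits for 4 nodes and 3 classes; the argmax of each row is the prediction.
logits = torch.tensor([[2.0, 0.1, 0.3],
                       [0.2, 1.5, 0.1],
                       [0.1, 0.2, 3.0],
                       [1.0, 0.9, 0.8]])
labels = torch.tensor([0, 1, 2, 1])  # the last node is misclassified as class 0

_, indices = torch.max(logits, dim=1)  # per-row argmax -> predicted classes
prediction = indices.long().numpy()
accuracy = (prediction == labels.numpy()).sum() / len(prediction)  # 0.75
micro_f1 = f1_score(labels.numpy(), prediction, average='micro')   # 0.75
macro_f1 = f1_score(labels.numpy(), prediction, average='macro')
print(accuracy, micro_f1, macro_f1)

Note that for single-label multiclass prediction, micro-F1 coincides with plain accuracy; macro-F1 instead averages the per-class F1 scores, so it weighs rare classes more heavily.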
model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
# DGL is a Python package for deep learning on graphs; it simplifies
# implementing graph neural networks.
from dgl.nn.pytorch import GATConv


class SemanticAttention(nn.Module):
    def __init__(self, in_size, hidden_size=128):
        super(SemanticAttention, self).__init__()

        # A two-layer MLP that scores each metapath-specific embedding.
        self.project = nn.Sequential(
            nn.Linear(in_size, hidden_size),  # fully connected layer
            nn.Tanh(),                        # activation
            nn.Linear(hidden_size, 1, bias=False)
        )

    def forward(self, z):
        # mean(0) averages the projected scores over all |V| nodes,
        # giving one score per metapath.
        w = self.project(z).mean(0)                     # (M, 1)
        beta = torch.softmax(w, dim=0)                  # (M, 1): normalized metapath weights beta
        beta = beta.expand((z.shape[0],) + beta.shape)  # (N, M, 1): broadcast to every node
        # z has shape (N, M, D * K), e.g. (3025, 2, 64) on ACM:
        # 2 metapaths, 64 = node embedding dimension output by the GAT layer.
        return (beta * z).sum(1)                        # (N, D * K): final per-node embedding


class HANLayer(nn.Module):
    """HAN layer.

    Arguments
    ---------
    num_meta_paths : number of homogeneous graphs generated from the metapaths.
    in_size : input feature dimension
    out_size : output feature dimension
    layer_num_heads : number of attention heads
    dropout : Dropout probability

    Inputs
    ------
    g : list[DGLGraph]
        List of graphs
    h : tensor
        Input features

    Outputs
    -------
    tensor
        The output feature
    """
    def __init__(self, num_meta_paths, in_size, out_size, layer_num_heads, dropout):
        super(HANLayer, self).__init__()

        # One GAT layer per metapath-based adjacency matrix; all metapaths
        # share the same dimensions, and out_size is the hidden size per head.
        self.gat_layers = nn.ModuleList()
        for i in range(num_meta_paths):
            self.gat_layers.append(GATConv(in_size, out_size, layer_num_heads,
                                           dropout, dropout, activation=F.elu))
        self.semantic_attention = SemanticAttention(in_size=out_size * layer_num_heads)  # semantic-level attention
        self.num_meta_paths = num_meta_paths

    def forward(self, gs, h):
        semantic_embeddings = []

        # Node-level attention: one GAT per metapath graph.
        for i, g in enumerate(gs):
            # self.gat_layers[i](g, h) has shape (3025, 8, 8) on ACM:
            # 3025 nodes, 8 heads, 8 hidden units per head; flatten(1)
            # concatenates the heads into (3025, 64).
            semantic_embeddings.append(self.gat_layers[i](g, h).flatten(1))
        # Stacking gives (N, M, D * K), e.g. (3025, 2, 64) for 2 metapaths.
        semantic_embeddings = torch.stack(semantic_embeddings, dim=1)

        # Semantic attention aggregates over the metapath dimension.
        return self.semantic_attention(semantic_embeddings)  # (N, D * K)


class HAN(nn.Module):
    def __init__(self, num_meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        super(HAN, self).__init__()

        # num_heads is a list whose length is the number of HAN layers;
        # entry l gives the head count of layer l. nn.ModuleList supports
        # append()/extend() like a Python list (append adds a module,
        # extend adds another ModuleList).
        self.layers = nn.ModuleList()
        self.layers.append(HANLayer(num_meta_paths, in_size, hidden_size, num_heads[0], dropout))
        for l in range(1, len(num_heads)):  # stacked layers (none with the default single-entry num_heads)
            self.layers.append(HANLayer(num_meta_paths, hidden_size * num_heads[l-1],
                                        hidden_size, num_heads[l], dropout))
        # out_size = num_classes (3 on ACM, taken from labels.shape[1]).
        self.predict = nn.Linear(hidden_size * num_heads[-1], out_size)

    def forward(self, g, h):
        # Each HANLayer applies node-level GATs followed by semantic attention.
        for gnn in self.layers:
            h = gnn(g, h)  # g: metapath adjacency graphs; h: node features

        return self.predict(h)
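To see the shape flow through SemanticAttention, here is a minimal sketch with random tensors. N=5 nodes is a made-up number for illustration; on ACM it would be 3025 nodes and 2 metapaths:

import torch
from model import SemanticAttention

# (N, M, D * K) = (5, 2, 64): 5 nodes, 2 metapaths,
# 64-dim per-metapath embeddings (8 heads x 8 hidden units).
z = torch.randn(5, 2, 64)
att = SemanticAttention(in_size=64)

out = att(z)
print(out.shape)  # torch.Size([5, 64]): metapath dimension fused away by the beta weights

Internally, project(z) yields (5, 2, 1) scores, mean(0) reduces them to one score per metapath (2, 1), softmax turns those into the beta weights, and the weighted sum over dim=1 removes the metapath axis.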
utils.py
import datetime
import dgl
import errno
import numpy as np
import os
import pickle
import random
import torch

from dgl.data.utils import download, get_download_dir, _get_dgl_url
from pprint import pprint
from scipy import sparse
from scipy import io as sio

# utils handles data loading and the early-stopping strategy.


def set_random_seed(seed=0):
    """Set random seed.

    Parameters
    ----------
    seed : int
        Random seed to use
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)


def mkdir_p(path, log=True):
    """Create a directory for the specified path.

    Parameters
    ----------
    path : str
        Path name
    log : bool
        Whether to print result for directory creation
    """
    try:
        os.makedirs(path)
        if log:
            print('Created directory {}'.format(path))
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path) and log:
            print('Directory {} already exists.'.format(path))
        else:
            raise


def get_date_postfix():
    """Get a date based postfix for directory name.

    Returns
    -------
    post_fix : str
    """
    dt = datetime.datetime.now()
    post_fix = '{}_{:02d}-{:02d}-{:02d}'.format(
        dt.date(), dt.hour, dt.minute, dt.second)

    return post_fix


def setup_log_dir(args, sampling=False):
    """Name and create directory for logging.

    Parameters
    ----------
    args : dict
        Configuration
    sampling : bool
        Whether we are using sampling based training

    Returns
    -------
    log_dir : str
        Path for logging directory
    """
    date_postfix = get_date_postfix()
    log_dir = os.path.join(
        args['log_dir'],
        '{}_{}'.format(args['dataset'], date_postfix))

    if sampling:
        log_dir = log_dir + '_sampling'

    mkdir_p(log_dir)
    return log_dir


# The configuration below is from the paper.
default_configure = {
    'lr': 0.005,             # Learning rate
    'num_heads': [8],        # Number of attention heads for node-level attention
    'hidden_units': 8,
    'dropout': 0.6,
    'weight_decay': 0.001,
    'num_epochs': 200,
    'patience': 100
}

sampling_configure = {
    'batch_size': 20
}


def setup(args):
    args.update(default_configure)
    set_random_seed(args['seed'])
    args['dataset'] = 'ACMRaw' if args['hetero'] else 'ACM'
    args['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    args['log_dir'] = setup_log_dir(args)
    return args


def setup_for_sampling(args):
    args.update(default_configure)
    args.update(sampling_configure)
    set_random_seed()
    args['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    args['log_dir'] = setup_log_dir(args, sampling=True)
    return args


def get_binary_mask(total_size, indices):
    # Set the positions of the given node indices to 1, everything else to 0.
    mask = torch.zeros(total_size)
    mask[indices] = 1
    return mask.byte()


def load_acm(remove_self_loop):
    url = 'dataset/ACM3025.pkl'
    data_path = get_download_dir() + '/ACM3025.pkl'
    # download(_get_dgl_url(url), path=data_path)  # only needed on the first run
    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    # todense() converts the sparse matrices to dense arrays;
    # long() casts the labels to 64-bit integers.
    labels, features = torch.from_numpy(data['label'].todense()).long(), \
                       torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    labels = labels.nonzero()[:, 1]  # convert one-hot labels to class indices

    if remove_self_loop:
        # Subtract the identity matrix (np.eye) to drop self-loops from the
        # metapath adjacency matrices; csr = compressed sparse row format.
        num_nodes = data['label'].shape[0]
        data['PAP'] = sparse.csr_matrix(data['PAP'] - np.eye(num_nodes))
        data['PLP'] = sparse.csr_matrix(data['PLP'] - np.eye(num_nodes))

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.from_scipy(data['PAP'])   # graph from the P-A-P metapath
    subject_g = dgl.from_scipy(data['PLP'])  # graph from the P-L-P metapath
    gs = [author_g, subject_g]  # collect the metapath graphs in one list

    # torch.from_numpy shares memory with the numpy array: modifying the
    # tensor modifies the array and vice versa, and the tensor cannot be resized.
    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = author_g.number_of_nodes()
    # Each mask is 1 at its split's node positions and 0 elsewhere.
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    print('dataset loaded')
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    return gs, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask


def load_acm_raw(remove_self_loop):
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask


def load_data(dataset, remove_self_loop=False):
    if dataset == 'ACM':
        return load_acm(remove_self_loop)
    elif dataset == 'ACMRaw':
        return load_acm_raw(remove_self_loop)
    else:
        raise NotImplementedError('Unsupported dataset {}'.format(dataset))


class EarlyStopping(object):
    def __init__(self, patience=10):
        dt = datetime.datetime.now()
        self.filename = 'early_stop_{}_{:02d}-{:02d}-{:02d}.pth'.format(
            dt.date(), dt.hour, dt.minute, dt.second)
        self.patience = patience
        self.counter = 0
        self.best_acc = None
        self.best_loss = None
        self.early_stop = False

    def step(self, loss, acc, model):
        if self.best_loss is None:
            self.best_acc = acc
            self.best_loss = loss
            self.save_checkpoint(model)
        elif (loss > self.best_loss) and (acc < self.best_acc):
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            if (loss <= self.best_loss) and (acc >= self.best_acc):
                self.save_checkpoint(model)
            self.best_loss = np.min((loss, self.best_loss))
            self.best_acc = np.max((acc, self.best_acc))
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        """Saves model when validation loss decreases."""
        torch.save(model.state_dict(), self.filename)

    def load_checkpoint(self, model):
        """Load the latest checkpoint."""
        model.load_state_dict(torch.load(self.filename))
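The EarlyStopping class can be exercised on its own; below is a minimal sketch with a toy model and a fabricated validation curve (all numbers are made up for illustration):

import torch
from utils import EarlyStopping

# A toy model and an invented validation curve, just to exercise the logic.
model = torch.nn.Linear(4, 2)
stopper = EarlyStopping(patience=2)

val_losses = [1.0, 0.8, 0.9, 0.95, 0.97]  # improves once, then degrades
val_accs = [0.5, 0.6, 0.55, 0.5, 0.45]
for loss, acc in zip(val_losses, val_accs):
    # step() returns True once both loss and accuracy have worsened
    # for `patience` consecutive calls.
    if stopper.step(loss, acc, model):
        print('stopping early')
        break

Note that step() also saves a checkpoint to self.filename whenever the validation metrics improve, so load_checkpoint() in main() restores the best model seen during training rather than the last one.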