void line_trainer_edge::init(char edge_type, line_hin *p_hin, int negative) { edge_tp = edge_type; phin = p_hin; neg_samples = negative; line_node *node_u = phin->node_u, *node_v = phin->node_v; if (node_u->vector_size != node_v->vector_size) { printf("ERROR: vector dimsions are not same!\n"); exit(1); } // compute the degree of vertices u_nb_cnt = (int *)calloc(node_u->node_size, sizeof(int)); u_wei = (double *)calloc(node_u->node_size, sizeof(double)); v_wei = (double *)calloc(node_v->node_size, sizeof(double)); for (int u = 0; u != node_u->node_size; u++) { for (int k = 0; k != (int)(phin->hin[u].size()); k++) { int v = phin->hin[u][k].nb_id; char cur_edge_type = phin->hin[u][k].eg_tp; double wei = phin->hin[u][k].eg_wei; if (cur_edge_type != edge_tp) continue; u_nb_cnt[u]++; u_wei[u] += wei; v_wei[v] += wei; } } // allocate spaces for edges u_nb_id = (int **)malloc(node_u->node_size * sizeof(int *)); u_nb_wei = (double **)malloc(node_u->node_size * sizeof(double *)); for (int k = 0; k != node_u->node_size; k++) { u_nb_id[k] = (int *)malloc(u_nb_cnt[k] * sizeof(int)); u_nb_wei[k] = (double *)malloc(u_nb_cnt[k] * sizeof(double)); } // read neighbors int *pst = (int *)calloc(node_u->node_size, sizeof(int)); for (int u = 0; u != node_u->node_size; u++) { for (int k = 0; k != (int)(phin->hin[u].size()); k++) { int v = phin->hin[u][k].nb_id; char cur_edge_type = phin->hin[u][k].eg_tp; double wei = phin->hin[u][k].eg_wei; if (cur_edge_type != edge_tp) continue; u_nb_id[u][pst[u]] = v; u_nb_wei[u][pst[u]] = wei; pst[u]++; } } free(pst); // init sampler for edges smp_u = ransampl_alloc(node_u->node_size); ransampl_set(smp_u, u_wei); smp_u_nb = (ransampl_ws **)malloc(node_u->node_size * sizeof(ransampl_ws *)); for (int k = 0; k != node_u->node_size; k++) { if (u_nb_cnt[k] == 0) continue; smp_u_nb[k] = ransampl_alloc(u_nb_cnt[k]); ransampl_set(smp_u_nb[k], u_nb_wei[k]); } // Init negative sampling table neg_table = (int *)malloc(neg_table_size * sizeof(int)); int a, i; double total_pow = 0, d1; double power = 0.75; for (a = 0; a < node_v->node_size; a++) total_pow += pow(v_wei[a], power); a = 0; i = 0; d1 = pow(v_wei[i], power) / (double)total_pow; while (a < neg_table_size) { if ((a + 1) / (double)neg_table_size > d1) { i++; if (i >= node_v->node_size) {i = node_v->node_size - 1; d1 = 2;} d1 += pow(v_wei[i], power) / (double)total_pow; } else neg_table[a++] = i; } expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); for (int i = 0; i < EXP_TABLE_SIZE; i++) { expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) } // add trans std::string tp = std::string(); tp += edge_tp; if (line_trainer_edge::map_trans[tp] == 0) { line_trans *ptrans = new line_trans; ptrans->init(tp, phin->node_u->vector_size); line_trainer_edge::vec_trans.push_back(ptrans); line_trainer_edge::map_trans[tp] = line_trainer_edge::cnt_trans + 1; line_trainer_edge::cnt_trans++; } }
void line_trainer_path::init(std::string meta_path, line_hin *p_hin, int negative) { path = meta_path; if (path.size() % 2 == 0) { printf("ERROR: meta-path %s error!\n", path.c_str()); exit(1); } path_size = ((int)(path.size()) + 1) / 2; for (int k = 0; k != path_size; k++) path_node.append(1, path[k * 2]); for (int k = 0; k != path_size - 1; k++) path_link.append(1, path[k * 2 + 1]); phin = p_hin; neg_samples = negative; line_node *node_u = p_hin->node_u, *node_v = p_hin->node_v; if (node_u->vector_size != node_v->vector_size) { printf("ERROR: vector dimsions are not same!\n"); exit(1); } expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); for (int i = 0; i < EXP_TABLE_SIZE; i++) { expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) } int node_size = node_u->node_size; dp_cnt = (double **)malloc(node_size * sizeof(double *)); for (int k = 0; k != node_size; k++) dp_cnt[k] = (double *)malloc(path_size * sizeof(double)); for (int i = 0; i != node_size; i++) for (int j = 0; j != path_size; j++) dp_cnt[i][j] = 0; char node_type, link_type; for (int step = path_size - 1; step >= 0; step--) { node_type = path_node[step]; if (step == path_size - 1) { for (int u = 0; u != node_size; u++) if ((node_u->node[u]).type == node_type) dp_cnt[u][step] = 1; } else { link_type = path_link[step]; for (int u = 0; u != node_size; u++) if ((node_u->node[u]).type == node_type) { int neighbor_size = (int)(phin->hin[u].size()); for (int i = 0; i != neighbor_size; i++) { int v = phin->hin[u][i].nb_id; char cur_link_type = phin->hin[u][i].eg_tp; double wei = phin->hin[u][i].eg_wei; if ((node_u->node[v]).type == path_node[step + 1] && link_type == cur_link_type) dp_cnt[u][step] += dp_cnt[v][step + 1] * wei; } } } } // Init negative sampling table neg_table = (int **)malloc(path_size * sizeof(int *)); for (int k = 1; k != path_size; k++) neg_table[k] = (int *)malloc(neg_table_size * sizeof(int)); for (int k = 1; k != path_size; k++) { int a, i; double total_pow = 0, d1; double power = 1;//0.75; for (a = 0; a < node_size; a++) total_pow += pow(dp_cnt[a][k], power); a = 0; i = 0; d1 = pow(dp_cnt[i][k], power) / (double)total_pow; while (a < neg_table_size) { if ((a + 1) / (double)neg_table_size > d1) { i++; if (i >= node_size) {i = node_size - 1; d1 = 2;} d1 += pow(dp_cnt[i][k], power) / (double)total_pow; } else neg_table[k][a++] = i; } } int node_cnt; // Init the sampling table of step 0 node_cnt = 0; node_type = path_node[0]; for (int u = 0; u != node_size; u++) if ((node_u->node[u]).type == node_type) node_cnt++; smp_init_index = (int *)malloc(node_cnt * sizeof(int)); smp_init_weight = (double *)malloc(node_cnt * sizeof(double)); smp_init = ransampl_alloc(node_cnt); node_cnt = 0; for (int u = 0; u != node_size; u++) if ((node_u->node[u]).type == node_type) { smp_init_index[node_cnt] = u; smp_init_weight[node_cnt] = dp_cnt[u][0]; node_cnt++; } ransampl_set(smp_init, smp_init_weight); // Init sampling tables of the following steps smp_dp_index = (int ***)malloc(path_size * sizeof(int **)); for (int k = 0; k != path_size; k++) smp_dp_index[k] = (int **)malloc(node_size * sizeof(int *)); for (int i = 0; i != path_size; i++) for (int j = 0; j != node_size; j++) smp_dp_index[i][j] = NULL; smp_dp_weight = (double ***)malloc(path_size * sizeof(double **)); for (int k = 0; k != path_size; k++) smp_dp_weight[k] = (double **)malloc(node_size * sizeof(double *)); for (int i = 0; i != path_size; i++) for (int j = 0; j != node_size; j++) smp_dp_weight[i][j] = NULL; smp_dp = (ransampl_ws ***)malloc(path_size * sizeof(ransampl_ws)); for (int k = 0; k != path_size; k++) smp_dp[k] = (ransampl_ws **)malloc(node_size * sizeof(ransampl_ws)); for (int i = 0; i != path_size; i++) for (int j = 0; j != node_size; j++) smp_dp[i][j] = NULL; for (int step = 0; step != path_size - 1; step++) { node_type = path_node[step]; link_type = path_link[step]; for (int u = 0; u != node_size; u++) if((node_u->node[u]).type == node_type) { node_cnt = 0; int neighbor_size = (int)(phin->hin[u].size()); for (int i = 0; i != neighbor_size; i++) { int v = phin->hin[u][i].nb_id; char cur_edge_tp = phin->hin[u][i].eg_tp; if ((node_u->node[v]).type == path_node[step + 1] && cur_edge_tp == link_type) node_cnt++; } if (node_cnt == 0) continue; smp_dp_index[step][u] = (int *)malloc(node_cnt * sizeof(int)); smp_dp_weight[step][u] = (double *)malloc(node_cnt * sizeof(double)); smp_dp[step][u] = ransampl_alloc(node_cnt); node_cnt = 0; for (int i = 0; i != neighbor_size; i++) { int v = phin->hin[u][i].nb_id; char cur_edge_tp = phin->hin[u][i].eg_tp; double wei = phin->hin[u][i].eg_wei; if ((node_u->node[v]).type == path_node[step + 1] && cur_edge_tp == link_type) { smp_dp_index[step][u][node_cnt] = v; smp_dp_weight[step][u][node_cnt] = dp_cnt[v][step + 1] * wei; node_cnt++; } } ransampl_set(smp_dp[step][u], smp_dp_weight[step][u]); } } // add trans for (int i = 0; i < path_size; i++) for (int j = i + 1; j < path_size; j++) { std::string tp = path.substr(i * 2, (j - i) * 2 + 1); if (line_trainer_path::map_trans[tp] == 0) { line_trans *ptrans = new line_trans; ptrans->init(tp, phin->node_u->vector_size); line_trainer_path::vec_trans.push_back(ptrans); line_trainer_path::map_trans[tp] = line_trainer_path::cnt_trans + 1; line_trainer_path::cnt_trans++; } } }
void line_link::init_transpose(char *file_name, line_node *p_u, line_node *p_v, int negative) { strcpy(link_file, file_name); node_u = p_u; node_v = p_v; neg_samples = negative; if (node_u->get_vector_dim() != node_v->get_vector_dim()) { printf("ERROR: vector dimsions are not same!\n"); exit(1); } dgr_u = (double *)calloc(node_u->node_size, sizeof(double)); dgr_v = (double *)calloc(node_v->node_size, sizeof(double)); // compute the number of edges char str[2 * MAX_STRING + 10000]; FILE *fi = fopen(link_file, "rb"); if (fi == NULL) { printf("ERROR: link file not found!\n"); printf("%s\n", link_file); exit(1); } edge_cnt = 0; while (fgets(str, sizeof(str), fi)) edge_cnt++; fclose(fi); // allocate spaces int u, v; double wei; edge_u = (int *)malloc(edge_cnt * sizeof(int)); edge_v = (int *)malloc(edge_cnt * sizeof(int)); edge_w = (double *)malloc(edge_cnt * sizeof(double)); if (edge_u == NULL || edge_v == NULL || edge_w == NULL) { printf("Error: memory allocation failed!\n"); exit(1); } graph = new std::vector<struct_neighbor>[node_u->node_size]; nb_set = new std::set<int>[node_u->node_size]; struct_neighbor cur_nb; // read edges fi = fopen(link_file, "rb"); for (int k = 0; k != edge_cnt; k++) { fscanf(fi, "%d %d %lf", &v, &u, &wei); if (k % 10000 == 0) { printf("Reading edges: %.3lf%%%c", k / (double)(edge_cnt + 1) * 100, 13); fflush(stdout); } // store edges edge_u[k] = u; edge_v[k] = v; edge_w[k] = wei; // update degrees dgr_u[u] += wei; dgr_v[v] += wei; // update graph cur_nb.index = v; cur_nb.wei = wei; graph[u].push_back(cur_nb); // update neibor set nb_set[u].insert(v); } fclose(fi); // initialize edge sampler ws = ransampl_alloc(edge_cnt); ransampl_set(ws, edge_w); // compute the value of exp function before training expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); for (int i = 0; i < EXP_TABLE_SIZE; i++) { expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) } // initialize the negative sampling table int a, i; double total_pow = 0, d1; real power = 0.75; neg_table_u = (int *)calloc(neg_table_size, sizeof(int)); neg_table_v = (int *)calloc(neg_table_size, sizeof(int)); total_pow = 0; for (a = 0; a < node_u->node_size; a++) total_pow += pow(dgr_u[a], power); i = 0; d1 = pow(dgr_u[i], power) / (real)total_pow; for (a = 0; a < neg_table_size; a++) { neg_table_u[a] = i; if (a / (real)neg_table_size > d1) { i++; d1 += pow(dgr_u[i], power) / (real)total_pow; } if (i >= node_u->node_size) i = node_u->node_size - 1; } total_pow = 0; for (a = 0; a < node_v->node_size; a++) total_pow += pow(dgr_v[a], power); i = 0; d1 = pow(dgr_v[i], power) / (real)total_pow; for (a = 0; a < neg_table_size; a++) { neg_table_v[a] = i; if (a / (real)neg_table_size > d1) { i++; d1 += pow(dgr_v[i], power) / (real)total_pow; } if (i >= node_v->node_size) i = node_v->node_size - 1; } printf("Reading edges from file: %s, DONE!\n", link_file); printf("Edge size: %lld\n", edge_cnt); }