/*
 * Read pipe-delimited customer records from `ifp`, one record per line, and
 * hand the fields of each record to add_customer().
 *
 * Expected line layout:
 *     name|customer_id|balance|address1|address2|address3
 *
 * Fields are split with strtok(), so runs of consecutive '|' characters are
 * treated as a single separator and fields missing from a short line are
 * passed as NULL. Blank lines are skipped.
 *
 * The field pointers passed to add_customer() point into a line buffer that
 * is overwritten by the next line and released before this function returns.
 * NOTE(review): this assumes add_customer() copies whatever it needs to keep;
 * confirm against its definition.
 */
void read_customers(FILE *ifp)
{
    char *line = NULL;  /* buffer owned by getline(); it may grow or move */
    size_t cap = 0;

    /*
     * Loop on getline()'s result instead of feof(): the EOF flag is only set
     * after a read has already failed, so a feof()-driven loop runs one extra
     * pass over stale data.
     */
    while (getline(&line, &cap, ifp) != -1) {
        /*
         * Trim the newline in place. `line` itself must never be reassigned:
         * getline() reuses (and may realloc) exactly this pointer.
         */
        line[strcspn(line, "\n")] = '\0';

        char *name = strtok(line, "|");
        if (name == NULL) {
            continue;  /* blank line: no fields to record */
        }
        char *customer_id = strtok(NULL, "|");
        char *balance = strtok(NULL, "|");
        char *address1 = strtok(NULL, "|");
        char *address2 = strtok(NULL, "|");
        char *address3 = strtok(NULL, "|");

        add_customer(name, customer_id, balance, address1, address2, address3);
    }

    free(line);
}
// read in the input file // and figure out pairs that // we want to train on void readpurchases(FILE* fp) { long long pk = 0; long id, company, brand, quantity; long customer_loc, product_loc; int parse_return; char dump[MAX_STRING+1]; // get the first line fgets(dump, MAX_STRING, fp); printf(" |\r"); while (!feof(fp)) { if ((pk*30)%INTERACTIONS==0) { printf("#"); fflush(stdout); } fgets(dump, MAX_STRING, fp); parse_return = parseline(dump, &id, &company, &brand, &quantity); if (parse_return != 0) continue; customer_loc = find_customer(id); if (customer_loc == -1) customer_loc = add_customer(id); product_loc = find_product(company, brand); if (product_loc == -1) product_loc = add_product(company, brand); for (int i=0; i<quantity; i++) { purchases[pk].custp = customers[customer_loc]; purchases[pk].prodp = products[product_loc]; pk++; if (pk > INTERACTIONS) { log_err("pk > INTERACTIONS"); exit(1); } } } printf("total pk: %lld\n", pk); }
// learn some stuff void run(FILE *fp) { clock_t start = clock(); clock_t now; int label; debug("Populating Hashes..."); long id, company, brand, quantity; rewind(fp); char dump[MAX_STRING+1]; // get the first line fgets(dump, MAX_STRING, fp); // create the temporary arrays real *custupdate = calloc(D, sizeof(real)); real *produpdate = calloc(D, sizeof(real)); real *cv; real *pv; real *randcv; real *randpv; real dot, mult; long customer_loc, product_loc; long linenum = 1; // file is in format <id,chain,dept,category,company,brand,date,productsize,productmeasure,purchasequantity,purchaseamount> while (!feof(fp)) { // get the next line from the file fgets(dump, MAX_STRING, fp); linenum++; sscanf(dump, "%ld,%*ld,%*ld,%*ld,%ld,%ld,%*25[^,],%*ld,%*30[^,],%ld,%*s", &id, &company, &brand, &quantity); quantity = 1; /* debug("Found id: %ld, company: %ld, brand: %ld", id, company, brand); */ customer_loc = find_customer(id); if (customer_loc == -1) customer_loc = add_customer(id); product_loc = find_product(company, brand); if (product_loc == -1) product_loc = add_product(company, brand); // Do the update label = 1; cv = customer_vecs + customer_loc*D; pv = product_vecs + product_loc*D; alpha = ALPHA; alpha = ALPHA * (1. 
- linenum / (real)(LINES + 1.)); if (alpha < ALPHA * 0.0001) alpha = ALPHA * 0.0001; /* debug("Looking at customer: %ld, product: %ld, dot: %g, mult: %g", customer_loc, product_loc, dot, mult); */ // adjust the weights dot = 0.; for (int i=0; i<D; i++) dot += cv[i]*pv[i]; mult = quantity*getmult(1., dot)*alpha; for (int i=0; i<D; i++) custupdate[i] = mult*pv[i]; for (int i=0; i<D; i++) produpdate[i] = mult*cv[i]; for (int i=0; i<quantity*NEGS; i++) { long randp = (lqrand()%PRODS); randpv = product_vecs + D*randp; // get the dot product dot = 0.; for (int i=0; i<D; i++) dot += cv[i]*randpv[i]; // get the multiplier /* mult = getmult(0., dot)*alpha/(NEGS+0.); */ mult = getmult(0., dot)*alpha; // adjust the weights for (int i=0; i<D; i++) custupdate[i] += mult*randpv[i]; for (int i=0; i<D; i++) randpv[i] += mult*cv[i]; } for (int i=0; i<quantity*NEGS; i++) { long randc = (lqrand()%CUSTS); randcv = customer_vecs + D*randc; // get the dot product dot = 0.; for (int i=0; i<D; i++) dot += randcv[i]*pv[i]; // get the multiplier /* mult = getmult(0., dot)*alpha/(NEGS+0.); */ mult = getmult(0., dot)*alpha; // adjust the weights for (int i=0; i<D; i++) produpdate[i] += mult*randcv[i]; for (int i=0; i<D; i++) randcv[i] += mult*pv[i]; } // apply updates for (int i=0; i<D; i++) cv[i] += custupdate[i]; for (int i=0; i<D; i++) pv[i] += produpdate[i]; for (int i=0; i<D; i++) if (isnan(cv[i]) || isnan(pv[i])) { log_err("We've hit a nan!!!!, linenum=%ld, line=%s", linenum, dump); exit(1); } if (linenum%10000 == 0) { /* double totcupdate = 0.; */ /* double totpupdate = 0.; */ /* for (int i=0; i<D; i++) totcupdate += custupdate[i]*custupdate[i]; */ /* for (int i=0; i<D; i++) totpupdate += produpdate[i]*produpdate[i]; */ /* double totcv = 0.; */ /* double totpv = 0.; */ /* for (int i=0; i<D; i++) totcv += cv[i]*cv[i]; */ /* for (int i=0; i<D; i++) totpv += pv[i]*pv[i]; */ /* double totcsize = 0.; */ /* double totpsize = 0.; */ /* for (long i=0; i<D*CUSTS; i++) totcsize += 
customer_vecs[i]*customer_vecs[i]; */ /* for (long i=0; i<D*CUSTS; i++) totpsize += product_vecs[i]*product_vecs[i]; */ now = clock(); int seconds_remaining = (int)((now - start)/(CLOCKS_PER_SEC+0.)*LINES/(linenum+0.)); int hours = seconds_remaining/(60*60); seconds_remaining -= hours*60*60; int minutes = seconds_remaining/60; seconds_remaining -= minutes*60; printf("%c%ldK lines processed. %.2f%% done. alpha=%g, num_customers=%ld, num_products=%ld. est time remaining %dh%2dm ", 13, linenum/1000, linenum/(LINES+0.)*100., alpha, num_customers, num_products, hours, minutes); /* printf("%c%ldK lines processed. %.2f%% done. alpha=%g, num_customers=%ld, num_products=%ld. est time remaining %dh%2dm, csize=%g,%g psize=%g,%g ", */ /* 13, linenum/1000, linenum/(LINES+0.)*100., alpha, num_customers, num_products, hours, minutes, */ /* sqrt(totcupdate), sqrt(totcv), */ /* /1* sqrt(totcsize), *1/ */ /* sqrt(totpupdate), sqrt(totpv) ); */ /* /1* sqrt(totpsize) ); *1/ */ fflush(stdout); } if (linenum%10000000 == 0) { FILE *fc = fopen(CUSTFILE,"w"); print_customers(fc); fclose(fc); FILE *fp = fopen(PRODFILE,"w"); print_products(fp); fclose(fp); } if (linenum>LINES-1) break; /* if (linenum > 100000) break; */ } printf("\n"); }