// Filters the layout system's room-candidate list by direction.
//
// Each candidate is scored with ComputeScore() for the direction named by the
// direction expression. Candidates whose score is more than the (non-negative)
// threshold below the best score are removed from the list; the rest are kept.
// Nothing happens when the list is empty or the direction string is invalid
// (the latter is reported through Log_Warning).
void CTilegenAction_FilterCandidatesByDirection::Execute( CLayoutSystem *pLayoutSystem )
{
	CUtlVector< CRoomCandidate > *pCandidates = pLayoutSystem->GetRoomCandidateList();
	if ( pCandidates->Count() == 0 )
	{
		return;
	}

	const char *pDirectionName = m_pDirectionExpression->Evaluate( pLayoutSystem->GetFreeVariables() );
	int nSlack = m_pThresholdExpression->Evaluate( pLayoutSystem->GetFreeVariables() );
	nSlack = MAX( 0, nSlack );	// negative thresholds are treated as zero

	ExitDirection_t exitDir = GetDirectionFromString( pDirectionName );
	const bool bDirectionValid = ( exitDir >= EXITDIR_BEGIN ) && ( exitDir < EXITDIR_END );
	if ( !bDirectionValid )
	{
		Log_Warning( LOG_TilegenLayoutSystem, "Invalid direction specified: %s.\n", pDirectionName );
		return;
	}

	// Pass 1: find the best score among all candidates.
	int nBestScore = INT_MIN;
	for ( int idx = 0; idx < pCandidates->Count(); ++idx )
	{
		const CRoomCandidate &candidate = pCandidates->Element( idx );
		const int nCandidateScore = ComputeScore( exitDir, candidate.m_iXPos, candidate.m_iYPos );
		if ( nBestScore < nCandidateScore )
		{
			nBestScore = nCandidateScore;
		}
	}

	// Pass 2: drop every candidate scoring below (best - threshold).
	// Walking from the back keeps the not-yet-visited indices valid while
	// FastRemove() is called on the current one.
	// @TODO: allow for specifying a numerical range in which candidates are chosen
	const int nCutoff = nBestScore - nSlack;
	int idx = pCandidates->Count();
	while ( idx-- > 0 )
	{
		const CRoomCandidate &candidate = pCandidates->Element( idx );
		if ( ComputeScore( exitDir, candidate.m_iXPos, candidate.m_iYPos ) < nCutoff )
		{
			pCandidates->FastRemove( idx );
		}
	}
}
// does the two-step clustering algorithm: // first make a subset of the data, to SubPoints points // then run CEM on this // then use these clusters to do a CEM on the full data float KK::Cluster() { KK KKSub; int i, d, p; //float StepSize; // for resampling int sPoints; // number of points to subset to if (Subset<=1) { // don't subset Output("--- Clustering full data set of %d points ---\n", nPoints); return CEM(NULL, 1, 1); } else { // run on a subset of points sPoints = nPoints/Subset; // number of subset points - integer division will round down // set up KKSub object KKSub.nDims = nDims; KKSub.nPoints = sPoints; KKSub.penaltyMix = PenaltyMix; KKSub.nStartingClusters = nStartingClusters; KKSub.AllocateArrays(); // fill KKSub with a subset of SubPoints from full data set. for (i=0; i<sPoints; i++) { // choose point to include, evenly spaced plus a random offset p= Subset*i + irand(0,Subset-1); // copy data for (d=0; d<nDims; d++) KKSub.Data[i*nDims + d] = Data[p*nDims + d]; } // run CEM algorithm on KKSub Output("--- Running on subset of %d points ---\n", sPoints); KKSub.CEM(NULL, 1, 1); // now copy cluster shapes from KKSub to main KK Weight = KKSub.Weight; Mean = KKSub.Mean; Cov = KKSub.Cov; ClassAlive = KKSub.ClassAlive; nClustersAlive = KKSub.nClustersAlive; AliveIndex = KKSub.AliveIndex; // Run E and C steps on full data set Output("--- Evaluating fit on full set of %d points ---\n", nPoints); EStep(); CStep(); // compute score on full data set and leave return ComputeScore(); } }
// CEM(StartFile) - Does a whole CEM algorithm from a random start // optional start file loads this cluster file to start iteration // if Recurse is 0, it will not try and split. // if InitRand is 0, use cluster assignments already in structure float KK::CEM(const mxArray *InputClass/*= NULL*/, int Recurse /*=1*/, int InitRand /*=1*/) { int p, c; int nChanged; int Iter; Array<int> OldClass(nPoints); float Score = HugeScore, OldScore; int LastStepFull; // stores whether the last step was a full one int DidSplit; if (InputClass!= NULL) LoadClu(InputClass); else if (InitRand) { // initialize data to random if (nStartingClusters>1) for(p=0; p<nPoints; p++) Class[p] = irand(1, nStartingClusters-1); else for(p=0; p<nPoints; p++) Class[p] = 0; for(c=0; c<MaxPossibleClusters; c++) ClassAlive[c] = (c<nStartingClusters); } // set all clases to alive Reindex(); // main loop Iter = 0; FullStep = 1; do { // Store old classifications for(p=0; p<nPoints; p++) OldClass[p] = Class[p]; // M-step - calculate class weights, means, and covariance matrices for each class MStep(); // E-step - calculate scores for each point to belong to each class EStep(); // dump distances if required //if (DistDump) MatPrint(Distfp, LogP.m_Data, DistDump, MaxPossibleClusters); // C-step - choose best class for each CStep(); // Would deleting any classes improve things? if(Recurse) ConsiderDeletion(); // Calculate number changed nChanged = 0; for(p=0; p<nPoints; p++) nChanged += (OldClass[p] != Class[p]); // Calculate score OldScore = Score; Score = ComputeScore(); if(Verbose>=1) { if(Recurse==0) Output("\t"); Output("Iteration %d%c: %d clusters Score %.7g nChanged %d\n", Iter, FullStep ? 'F' : 'Q', nClustersAlive, Score, nChanged); } Iter++; /* if (Debug) { for(p=0;p<nPoints;p++) BestClass[p] = Class[p]; SaveOutput(BestClass); Output("Press return"); getchar(); }*/ // Next step a full step? 
LastStepFull = FullStep; FullStep = ( nChanged>ChangedThresh*nPoints || nChanged == 0 || Iter%FullStepEvery==0 // || Score > OldScore Doesn't help! // Score decreases are not because of quick steps! ) ; if (Iter>MaxIter) { Output("Maximum iterations exceeded\n"); break; } // try splitting if ((Recurse && SplitEvery>0) && (Iter%SplitEvery==SplitEvery-1 || (nChanged==0 && LastStepFull))) { DidSplit = TrySplits(); } else DidSplit = 0; } while (nChanged > 0 || !LastStepFull || DidSplit); //if (DistDump) fprintf(Distfp, "\n"); return Score; }
// for each cluster, try to split it in two. if that improves the score, do it. // returns 1 if split was successful int KK::TrySplits() { int i, c, cc, c2, p, p2, d, DidSplit = 0; float Score, NewScore, UnsplitScore, SplitScore; int UnusedCluster; KK K2; // second KK structure for sub-clustering KK K3; // third one for comparison if(nClustersAlive>=MaxPossibleClusters-1) { Output("Won't try splitting - already at maximum number of clusters\n"); return 0; } // set up K3 K3.nDims = nDims; K3.nPoints = nPoints; K3.penaltyMix = PenaltyMix; K3.AllocateArrays(); for(i=0; i<nDims*nPoints; i++) K3.Data[i] = Data[i]; Score = ComputeScore(); // loop thu clusters, trying to split for (cc=1; cc<nClustersAlive; cc++) { c = AliveIndex[cc]; // set up K2 strucutre to contain points of this cluster only // count number of points and allocate memory K2.nPoints = 0; K2.penaltyMix = PenaltyMix; for(p=0; p<nPoints; p++) if(Class[p]==c) K2.nPoints++; if(K2.nPoints==0) continue; K2.nDims = nDims; K2.AllocateArrays(); K2.NoisePoint = 0; // put data into K2 p2=0; for(p=0; p<nPoints; p++) if(Class[p]==c) { for(d=0; d<nDims; d++) K2.Data[p2*nDims + d] = Data[p*nDims + d]; p2++; } // find an unused cluster UnusedCluster = -1; for(c2=1; c2<MaxPossibleClusters; c2++) { if (!ClassAlive[c2]) { UnusedCluster = c2; break; } } if (UnusedCluster==-1) { Output("No free clusters, abandoning split"); return DidSplit; } // do it if (Verbose>=1) Output("Trying to split cluster %d (%d points) \n", c, K2.nPoints); K2.nStartingClusters=2; // (2 = 1 clusters + 1 unused noise cluster) UnsplitScore = K2.CEM(NULL, 0, 1); K2.nStartingClusters=3; // (3 = 2 clusters + 1 unused noise cluster) SplitScore = K2.CEM(NULL, 0, 1); // Fix by Michaël Zugaro: replace next line with following two lines // if(SplitScore<UnsplitScore) { if(K2.nClustersAlive<2) Output("Split failed - leaving alone\n"); if(SplitScore<UnsplitScore&&K2.nClustersAlive>=2) { // will splitting improve the score in the whole data set? 
// assign clusters to K3 for(c2=0; c2<MaxPossibleClusters; c2++) K3.ClassAlive[c2]=0; p2 = 0; for(p=0; p<nPoints; p++) { if(Class[p]==c) { if(K2.Class[p2]==1) K3.Class[p] = c; else if(K2.Class[p2]==2) K3.Class[p] = UnusedCluster; else Error("split should only produce 2 clusters"); p2++; } else K3.Class[p] = Class[p]; K3.ClassAlive[K3.Class[p]] = 1; } K3.Reindex(); // compute scores K3.MStep(); K3.EStep(); NewScore = K3.ComputeScore(); Output("Splitting cluster %d changes total score from %f to %f\n", c, Score, NewScore); if (NewScore<Score) { DidSplit = 1; Output("So it's getting split into cluster %d.\n", UnusedCluster); // so put clusters from K3 back into main KK struct (K1) for(c2=0; c2<MaxPossibleClusters; c2++) ClassAlive[c2] = K3.ClassAlive[c2]; for(p=0; p<nPoints; p++) Class[p] = K3.Class[p]; } else { Output("So it's not getting split.\n"); } } } return DidSplit; }
//All the *_grad is the gradient of pos_score - neg_score w.r.t. the parameter * void CRelation::TrainRelatTriple(int head, bool head_is_word, int r, int tail) { real head_grads[MAX_EMBEDDING_SIZE], tail_grads[MAX_EMBEDDING_SIZE], grads_tmp[MAX_EMBEDDING_SIZE], relat_grads[MAX_EMBEDDING_SIZE], negh_grads[MAX_EMBEDDING_SIZE], negt_grads[MAX_EMBEDDING_SIZE], negr_grads[MAX_EMBEDDING_SIZE]; real head_left_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK], head_right_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK], tail_left_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK], tail_right_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK]; real negr_head_left_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK], negr_head_right_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK], negr_tail_left_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK], negr_tail_right_mat_grads[MAX_EMBEDDING_SIZE * MAX_RELAT_RANK]; real head_mat_grads[MAX_RELAT_RANK], tail_mat_grads[MAX_RELAT_RANK]; real negr_head_mat_grads[MAX_RELAT_RANK], negr_tail_mat_grads[MAX_RELAT_RANK]; //Used for the diag case real off_vec[MAX_EMBEDDING_SIZE], neg_off_vec[MAX_EMBEDDING_SIZE]; real Qh_h[MAX_RELAT_RANK], Qh_negh[MAX_RELAT_RANK], Qt_t[MAX_RELAT_RANK], Qt_negt[MAX_RELAT_RANK]; real PhT_offvec[MAX_RELAT_RANK], PtT_offvec[MAX_RELAT_RANK]; memset(head_grads, 0, sizeof(real)* Opt::embeding_size); memset(tail_grads, 0, sizeof(real)* Opt::embeding_size); memset(relat_grads, 0, sizeof(real)* Opt::embeding_size); memset(negh_grads, 0, sizeof(real)* Opt::embeding_size); memset(negt_grads, 0, sizeof(real)* Opt::embeding_size); memset(negr_grads, 0, sizeof(real)* Opt::embeding_size); if (Opt::update_mat && !Opt::is_diag) { memset(head_left_mat_grads, 0, sizeof(real)* Opt::head_relat_rank * Opt::embeding_size); memset(head_right_mat_grads, 0, sizeof(real)* Opt::head_relat_rank * Opt::embeding_size); if (Opt::use_tail_mat) { memset(tail_left_mat_grads, 0, sizeof(real)* Opt::embeding_size * Opt::tail_relat_rank); memset(tail_right_mat_grads, 0, 
sizeof(real)* Opt::tail_relat_rank * Opt::embeding_size); } memset(negr_head_left_mat_grads, 0, sizeof(real)* Opt::embeding_size * Opt::head_relat_rank); memset(negr_head_right_mat_grads, 0, sizeof(real)* Opt::embeding_size * Opt::head_relat_rank); if (Opt::use_tail_mat) { memset(negr_tail_left_mat_grads, 0, sizeof(real)* Opt::embeding_size * Opt::tail_relat_rank); memset(negr_tail_right_mat_grads, 0, sizeof(real)* Opt::embeding_size * Opt::tail_relat_rank); } } if (Opt::update_mat && Opt::is_diag) { memset(head_mat_grads, 0, sizeof(real)* Opt::embeding_size); memset(tail_mat_grads, 0, sizeof(real)* Opt::embeding_size); memset(negr_head_mat_grads, 0, sizeof(real)* Opt::embeding_size); memset(negr_tail_mat_grads, 0, sizeof(real)* Opt::embeding_size); } //Sample neg head word and neg tail word int neg_head, neg_tail, neg_r; do { neg_head = SampleWordIdx(); } while (neg_head == head || neg_head == tail); do { neg_tail = SampleWordIdx(); } while (neg_tail == head || neg_tail == tail); do { neg_r = SampleRelatIdx(); } while (neg_r == r); real* head_embedding = WordParams::p_embedding[head]; real* tail_embedding = WordParams::p_embedding[tail]; real* relat_embedding = p_relat_emb[r]; real* relat_act_embedding = Opt::act_relat ? p_relat_act_emb[r] : NULL; real* neg_head_embedding = WordParams::p_embedding[neg_head]; real* neg_tail_embedding = WordParams::p_embedding[neg_tail]; real* neg_r_embedding = p_relat_emb[neg_r]; real* neg_r_act_embedding = Opt::act_relat ? 
p_relat_act_emb[neg_r] : NULL; real pos_score = ComputeScore(head, r, tail, Qh_h, Qt_t, off_vec); real negh_score = ComputeScore(neg_head, r, tail, Qh_negh, Qt_t, neg_off_vec); real hgap; bool is_negh_margin_satisfied; hgap = negh_score - pos_score; is_negh_margin_satisfied = hgap > Opt::margin; if (!is_negh_margin_satisfied) //margin is not satisfied ComputeGradient(1, Opt::relat_neg_weight, r, grads_tmp, negh_grads, tail_grads, relat_grads, head_left_mat_grads, head_right_mat_grads, tail_left_mat_grads, tail_right_mat_grads, neg_off_vec, Qh_negh, Qt_t, PhT_offvec, PtT_offvec, neg_head_embedding, tail_embedding, head_mat_grads, tail_mat_grads); //Begin updating the loss and grads for ||LR(h - t)||_2^2 - ||LR(h - negt)||_2^2 real negt_score = ComputeScore(head, r, neg_tail, Qh_h, Qt_negt, neg_off_vec); real tgap; bool is_negt_margin_satisfied; tgap = negt_score - pos_score; is_negt_margin_satisfied = tgap > Opt::margin; if (!is_negt_margin_satisfied) ComputeGradient(1, Opt::relat_neg_weight, r, grads_tmp, head_grads, negt_grads, relat_grads, head_left_mat_grads, head_right_mat_grads, tail_left_mat_grads, tail_right_mat_grads, neg_off_vec, Qh_h, Qt_negt, PhT_offvec, PtT_offvec, head_embedding, neg_tail_embedding, head_mat_grads, tail_mat_grads); real negr_score = ComputeScore(head, neg_r, tail, Qh_negh, Qt_negt, neg_off_vec); real rgap; bool is_negr_margin_satisfied = true;; rgap = negr_score - pos_score; is_negr_margin_satisfied = rgap > Opt::margin; if (!is_negr_margin_satisfied) ComputeGradient(1, Opt::relat_neg_weight, neg_r, grads_tmp, head_grads, tail_grads, negr_grads, negr_head_left_mat_grads, negr_head_right_mat_grads, negr_tail_left_mat_grads, negr_tail_right_mat_grads, neg_off_vec, Qh_negh, Qt_negt, PhT_offvec, PtT_offvec, head_embedding, tail_embedding, negr_head_mat_grads, negr_tail_mat_grads); if (!Opt::sig_relat && is_negh_margin_satisfied && is_negt_margin_satisfied && is_negr_margin_satisfied) return; int effect_cnt = (is_negh_margin_satisfied ? 
0 : 1) + (is_negt_margin_satisfied ? 0 : 1) + (is_negr_margin_satisfied ? 0 : 1); ComputeGradient(-1, effect_cnt, r, grads_tmp, head_grads, tail_grads, relat_grads, head_left_mat_grads, head_right_mat_grads, tail_left_mat_grads, tail_right_mat_grads, off_vec, Qh_h, Qt_t, PhT_offvec, PtT_offvec, head_embedding, tail_embedding, head_mat_grads, tail_mat_grads); //Gradient Checking /*real gap = (is_negh_margin_satisfied ? 0 : negh_score - pos_score) + (is_negt_margin_satisfied ? 0 : negt_score - pos_score) + (is_negr_margin_satisfied ? 0 : negr_score - pos_score); if (rand() < 20) { const double epsilon = 1e-6; //head_embedding[idx] += epsilon; //p_head_left_mat[r][idx] += epsilon; //p_tail_right_mat[r][idx] += epsilon; //tail_embedding[idx] += epsilon; //p_actual_left_mat[r][idx] = 2 * Util::Sigmoid(p_left_mat[r][41]) - 1; //p_tail_left_mat[r][idx] += epsilon; //p_head_left_mat[neg_r][idx] += epsilon; //p_actual_right_mat[r][idx] = 2 * Util::Sigmoid(p_right_mat[r][idx]) - 1; //p_relat_emb[neg_r][idx] += epsilon; //p_relat_act_emb[neg_r][idx] = 2 * Util::Sigmoid(p_relat_emb[neg_r][idx]) - 1; idx = -1; for (auto x : tail_diag_mat_ele[r]) idx = x.first; printf("\n"); tail_diag_mat_ele[neg_r][idx] += epsilon; real new_gap = 0, pos_score = ComputeLoss(head, tail, r); //ComputeLoss(head, tail, neg_r) - ComputeLoss(head, tail, r); real neg_gap = ComputeLoss(neg_head, tail, r) - pos_score; if (neg_gap <= Opt::margin) new_gap += neg_gap; neg_gap = ComputeLoss(head, neg_tail, r) - pos_score; if (neg_gap <= Opt::margin) new_gap += neg_gap; neg_gap = ComputeLoss(head, tail, neg_r) - pos_score; if (neg_gap <= Opt::margin) new_gap += neg_gap; printf("real gradient: %.5f, our gradient %.5f, idx:%d\n", (new_gap - gap) / epsilon, negr_tail_mat_grads[idx], idx); //p_tail_right_mat[r][idx] -= epsilon; //p_head_left_mat[neg_r][idx] -= epsilon; //tail_embedding[idx] -= epsilon; //p_actual_right_mat[r][idx] = 2 * Util::Sigmoid(p_right_mat[r][idx]) - 1; //head_embedding[idx] -= epsilon; 
//p_relat_emb[neg_r][idx] -= epsilon; //p_relat_act_emb[neg_r][idx] = 2 * Util::Sigmoid(p_relat_emb[neg_r][idx]) - 1; //p_actual_left_mat[neg_r][idx] = 2 * Util::Sigmoid(p_left_mat[r][41]) - 1; tail_diag_mat_ele[neg_r][idx] -= epsilon; }*/ double step_size = GetStepSize(head, r, tail); step_size /= effect_cnt; Util::MatPlusMat(relat_embedding, relat_grads, step_size, Opt::embeding_size, 1); Util::MatPlusMat(head_embedding, head_grads, step_size, Opt::embeding_size, 1); Util::MatPlusMat(tail_embedding, tail_grads, step_size, Opt::embeding_size, 1); if (!is_negt_margin_satisfied) Util::MatPlusMat(neg_tail_embedding, negt_grads, step_size, Opt::embeding_size, 1); if (!is_negh_margin_satisfied) Util::MatPlusMat(neg_head_embedding, negh_grads, step_size, Opt::embeding_size, 1); if (!is_negr_margin_satisfied) Util::MatPlusMat(neg_r_embedding, negr_grads, step_size, Opt::embeding_size, 1); if (Opt::update_mat && !Opt::is_diag) { Util::MatPlusMat(p_head_left_mat[r], head_left_mat_grads, step_size, Opt::embeding_size, Opt::head_relat_rank); Util::MatPlusMat(p_head_right_mat[r], head_right_mat_grads, step_size, Opt::head_relat_rank, Opt::embeding_size); if (Opt::use_tail_mat) { Util::MatPlusMat(p_tail_left_mat[r], tail_left_mat_grads, step_size, Opt::embeding_size, Opt::tail_relat_rank); Util::MatPlusMat(p_tail_right_mat[r], tail_right_mat_grads, step_size, Opt::tail_relat_rank, Opt::embeding_size); } if (!is_negr_margin_satisfied) { Util::MatPlusMat(p_head_left_mat[neg_r], negr_head_left_mat_grads, step_size, Opt::embeding_size, Opt::head_relat_rank); Util::MatPlusMat(p_head_right_mat[neg_r], negr_head_right_mat_grads, step_size, Opt::embeding_size, Opt::head_relat_rank); if (Opt::use_tail_mat) { Util::MatPlusMat(p_tail_left_mat[neg_r], negr_tail_left_mat_grads, step_size, Opt::embeding_size, Opt::tail_relat_rank); Util::MatPlusMat(p_tail_right_mat[neg_r], negr_tail_right_mat_grads, step_size, Opt::embeding_size, Opt::tail_relat_rank); } } } else if (Opt::update_mat && 
Opt::is_diag) { for (auto x : head_diag_mat_ele[r]) head_diag_mat_ele[r][x.first] += step_size * head_mat_grads[x.first]; if (Opt::use_tail_mat) for (auto x : tail_diag_mat_ele[r]) tail_diag_mat_ele[r][x.first] += step_size * tail_mat_grads[x.first]; if (!is_negr_margin_satisfied) { for (auto x : head_diag_mat_ele[neg_r]) head_diag_mat_ele[neg_r][x.first] += step_size * negr_head_mat_grads[x.first]; if (Opt::use_tail_mat) for (auto x : tail_diag_mat_ele[neg_r]) tail_diag_mat_ele[neg_r][x.first] += step_size * negr_tail_mat_grads[x.first]; } } ConstrainParameters(r); if (!is_negr_margin_satisfied) ConstrainParameters(neg_r); }
int main(int argc, char* argv[]){ //call the check args function to check the input arguments checkArgs(argc, argv); //init the HashTable HashTable* Table = ReadFile(argv[1]); //init the array to hold all of the input words char wordArray[MAX_ROWS][MAX_ROWS][MAX_WORD_LENGTH + 1]; //init keyboard input string char line[MAX_WORD_LENGTH+1]; while (1){ //main loop printf("\nEnter your string (enter \"QUIT\" to exit the function) \n"); //accept user input. Deal with user input longer than the max line if (fgets(line, MAX_LINE, stdin)){ if (NULL == strchr(line, '\n')){ printf("Query only accepts 1000 characters\n"); eat_extra(); //"eats" characters after 1000 characters are input then exits exit(1); } } //handle when the user quits the program if (strcmp(line, "QUIT\n") == 0){ printf("Exit command reached, Cleaning memory and quitting\n"); CleanHashMemory(Table); exit(0); } // size_t length = strlen(line); // printf("length of input is %zu\n", length ); //check if the inputted line ends with AND or OR EndsWithAND(line); EndsWithOR(line); char* argv2 = argv[2]; //make sure the wordArray is cleared out between queries memset(wordArray, 0, sizeof(wordArray[0][0][0]) * 500 * MAX_ROWS * MAX_WORD_LENGTH + 1); int FinalDocMatchArray[1705] = {0}; //keep the documents ids that have matched all the criteria int FinalArrayIndex = 0; int scoreArray[1705] = {0}; //keep the scores of the FinalDocMatchArray in parallel positions int index = 0; //init variables for GetNextWord int pos = 0; int counter = 0; int andPos = 0; int andFlag = 0; int orFlag = 0; int orPos = 0; char* word; while((pos = GetNextWord(line, pos, &word)) > 0){ //go through the words in the query //if the word exists, add it to the hash table if (word != NULL && strlen(word) < MAX_WORD_LENGTH) { //check if it starts with AND or OR if (counter == 0 && (strcmp(word, "AND") == 0 || strcmp(word, "OR") == 0)){ printf("Input cannot start or end with AND or OR\n"); exit(1); } else if (strcmp(word, "AND") == 0){ // 
printf("AND detected\n"); if (andFlag == 1) { printf("Two ANDs in a row. Invalid input.\n"); exit(1); } andFlag = 1; } //detect ORs and increment position in wordArray else if (strcmp(word, "OR") == 0){ // printf("OR detected\n"); if (orFlag == 1) { printf("Two ORs in a row. Invalid input.\n"); exit(1); } orPos++; andPos = 0; orFlag = 1; } else{ NormalizeWord(word); // printf("Word is %s %i\n", word, counter); andFlag = 0; orFlag = 0; //put the word in the wordArray at the appropriate place int len = strlen(word+1); char wordCpy[len+1]; strcpy(wordCpy,word); strcpy(wordArray[andPos][orPos], wordCpy); // printf("Adding %s to array at %i %i \n",word, andPos, orPos ); andPos++; } counter++; } free(word); word = NULL; } //k is incremented every time an OR is processed int k = 0; while (strcmp(wordArray[0][k], "") != 0){ int docMatchArray[1705] = {0}; //temporary array of matching documents int docMatchArrayIndex = 0; char* firstWord = wordArray[0][k]; // printf("Word is: %s\n", firstWord); //compute jenkins hash int hashResult = JenkinsHash(firstWord, MAX_HASH_SLOT); if (Table->table[hashResult] == NULL){ printf("%s does not exist in hashTable database\n", firstWord ); exit(1); } //go through the hashtable until you find the appropriate word and documents //put it into a temporary array to be matched against else{ WordNode* node2 = Table->table[hashResult]; WordNode* dummyWord = node2; while (dummyWord != NULL){ //go through all the linked words DocumentNode *dummy_doc = dummyWord->page; if (strcmp(dummyWord->word, firstWord) == 0){//if they are the same word, go through the document nodes //go through the document nodes while (dummy_doc != NULL) { //put all of the first words docs into the temp list docMatchArray[docMatchArrayIndex] = dummy_doc->doc_id; docMatchArrayIndex++; //advance dummy_doc = dummy_doc->next; } break; //you've found the word, no need to continue to other words } else{ // printf("Did not find %s\n", firstWord ); } dummyWord = dummyWord->next; // 
printf("Advancing\n"); } } //if there's only 1 word to examine, no need to compare other words if (strcmp(wordArray[1][k], "") == 0){ //add everything in the doc match array to the FinalDocMatchArray for (int i = 0; i < docMatchArrayIndex; i ++ ){ if (docMatchArray[i] != '\0'){ int dupIndex = 0; int dupFlag = 0; while (FinalDocMatchArray[dupIndex] != '\0'){ //check if they're the same if(docMatchArray[i] == FinalDocMatchArray[dupIndex]){ // printf("FOUND A DUPLICATE for %i\n", docMatchArray[i] ); dupFlag = 1; //a duplicate was found, compute the final score and increment that element int finalScore = 0; int index3=0; // printf("docNum is %i\n",FinalDocMatchArray[index]); while(strcmp(wordArray[index3][k],"") != 0){ //for every word //go through all the words and compute the final score finalScore += ComputeScore(FinalDocMatchArray[dupIndex], Table, wordArray[index3][k]); index3++; } //put it in the score array scoreArray[dupIndex] += finalScore; finalScore = 0; break; } dupIndex++; } //if the duplicate was not found and there's only 1 word, then put everything into the final array if (dupFlag != 1) { //if a duplicate was not found in the list FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i]; FinalArrayIndex++; } } } } //if there's more than one word between the OR statements, compute the final scores for them all else{ for (int i = 0; i < 1705; i ++){ if (docMatchArray[i] != 0){ int result = 1; int m = 0;//make sure to adjust based on current position in masterList //for every doc in the docMatchArray, test if all other words contain that doc while (strcmp(wordArray[m][k], "") != 0) { //increment word //check if this word's documents and see if there's a match result = findDocMatch(docMatchArray[i], Table, wordArray[m][k]); if (result != 0){ break; //the document had no matches, skip the rest } m++; } if (result == 0){ //before you add it to the final array, check if you've already added it int dupIndex2 = 0; int dupFlag2 = 0; while 
(FinalDocMatchArray[dupIndex2] != '\0'){ //if it's already in the list, then only increment the score if(docMatchArray[i] == FinalDocMatchArray[dupIndex2]){ dupFlag2 = 1; int finalScore2 = 0; int index4 = 0; while(strcmp(wordArray[index4][k],"") != 0){//for every word // printf("Word is %s\n",wordArray[index4][k]); finalScore2 += ComputeScore(FinalDocMatchArray[dupIndex2], Table, wordArray[index4][k]); index4++; } scoreArray[dupIndex2] += finalScore2; //increment the appropriate score finalScore2 = 0; break; } dupIndex2++; } //otherwise, add it to end of the Final Array if (dupFlag2 != 1){ FinalDocMatchArray[FinalArrayIndex] = docMatchArray[i]; FinalArrayIndex++; } } } } } //compute the scores for all the non-duplicates toward the end of the array int finalScore = 0; while (FinalDocMatchArray[index] != '\0'){ //for every doc that matches all AND words int index2=0; // printf("docNum is %i\n",FinalDocMatchArray[index]); while(strcmp(wordArray[index2][k],"") != 0){//for every word // printf("Word is %s\n",wordArray[index2][k]); finalScore += ComputeScore(FinalDocMatchArray[index], Table, wordArray[index2][k]); index2++; } // printf("Score for %i is %i\n",FinalDocMatchArray[index], finalScore); //put it in the score array scoreArray[index] = finalScore; finalScore = 0; index++; } k++; //increment OR position } //sort the Final Array BubbleSort(FinalDocMatchArray, scoreArray, argv2); }//loop back to string entry } //end main