void DiffHelper::longest_common(ptrdiff_t l1, ptrdiff_t l2, ptrdiff_t r1, ptrdiff_t r2) { for (; l1 < l2 && r1 < r2 && match(l1, r1); ++l1, ++r1) {} for (; l1 < l2 && r1 < r2 && match(l2 - 1, r2 - 1); --l2, --r2) {} if (l1 == l2) { std::fill(rchanges.begin() + r1, rchanges.begin() + r2, true); return; } if (r1 == r2) { std::fill(lchanges.begin() + l1, lchanges.begin() + l2, true); return; } fwd[1] = l1; rev[-1] = l2; ptrdiff_t d1 = l1 - r1, d2 = l2 - r2, delta = d2 - d1; for (ptrdiff_t d = 0;; ++d) { for (ptrdiff_t k = - d; k <= d; k = k + 2) { ptrdiff_t i = k == - d ? fwd[k + 1] : fwd[k - 1] + 1; if (k < d) i = std::max(i, fwd[k + 1]); for (ptrdiff_t j = i - k - d1; i < l2 && j < r2 && match(i, j); ++i, ++j) {} fwd[k] = i; if ((delta & 1) && k > delta - d && k < delta + d && rev[k - delta] <= fwd[k]) { longest_common(l1, fwd[k], r1, fwd[k] - k - d1); longest_common(fwd[k], l2, fwd[k] - k - d1, r2); return; } } for (ptrdiff_t k = - d; k <= d; k = k + 2) { ptrdiff_t i = k == d ? rev[k - 1] : rev[k + 1] - 1; if (k > - d) i = std::min(i, rev[k - 1]); for (ptrdiff_t j = i - k - d2; i > l1 && j > r1 && match(i - 1, j - 1); --i, --j) {} rev[k] = i; if (! (delta & 1) && k >= - d - delta && k <= d - delta && rev[k] <= fwd[k + delta]) { longest_common(l1, fwd[k + delta], r1, fwd[k + delta] - k - d2); longest_common(fwd[k + delta], l2, fwd[k + delta] - k - d2, r2); return; } } } }
/**
 * @brief check to make sure the copyright table has been created
 *
 * will attempt to access the copyright table, if the response from the database
 * indicates that the copyright table does not exist, this will also attempt to
 * create the table for future use. When the table is present the foreign key
 * query is run as well, and finally the ARS table query is run and the ARS
 * table is created if that query does not succeed.
 *
 * Every PGresult obtained here is released with PQclear before the variable is
 * reused, so no result is leaked.
 *
 * @param pgConn the connection to the database
 * @return 1 if the table exists at the end of the function, 0 otherwise; when
 *         setup_database() or cleanup_copyright() is invoked, its return value
 *         is returned instead
 */
int check_copyright_table(PGconn* pgConn)
{
  /* local variables */
  PGresult* pgResult;   // the result of the database access
  int ret;              // the value returned by this function
  char* str;            // the string error message if the database access fails
  char buffer[256];     // a buffer used for string manipulation

  /* initialize memory and do the sql access */
  ret = 1;
  str = NULL;
  memset(buffer, '\0', sizeof(buffer));
  pgResult = PQexec(pgConn, check_database_table);

  /* check if the database already exists */
  if(PQresultStatus(pgResult) != PGRES_TUPLES_OK)
  {
    /* str is owned by pgResult: it must not be freed and must not be used */
    /* after the PQclear below                                             */
    str = PQresultErrorMessage(pgResult);

    /* 14 is the length of "does not exist"; presumably longest_common     */
    /* returns the length of the shared text it found (defined elsewhere)  */
    if(longest_common(buffer, str, "does not exist") == 14)
    {
      fprintf(cerr, "WARNING %s.%d: Could not find copyright table.", __FILE__, __LINE__);
      ret = setup_database(pgConn);
    }
    else
    {
      fprintf(cerr, "ERROR %s.%d: problem with copyright table\n", __FILE__, __LINE__);
      fprintf(cerr, "ERROR PQ error message: %s\n", PQresultErrorMessage(pgResult));
      fprintf(cerr, "ERROR sql was: %s\n", check_database_table);
      ret = 0;
    }
  }
  else
  {
    /* done with the table query, release it before running the next one */
    PQclear(pgResult);

    /* check if the copyright foreign key exists */
    pgResult = PQexec(pgConn, check_copyright_foreign_key);
    if(PQntuples(pgResult) != 1)
    {
      ret = cleanup_copyright(pgConn);
    }
  }

  /* release whichever result the branch above left behind */
  PQclear(pgResult);

  /* check if the copyright ars table exists */
  /* NOTE(review): with "&&" a successful query that returns no rows does   */
  /* not trigger table creation; confirm whether "||" was intended          */
  pgResult = PQexec(pgConn, check_copyright_ars);
  if(PQresultStatus(pgResult) != PGRES_TUPLES_OK && PQntuples(pgResult) != 1)
  {
    fo_CreateARSTable(pgConn, AGENT_ARS);
  }

  /* clean up memory and return */
  PQclear(pgResult);
  return ret;
}
/**
 * @brief runs the labeled test files to determine accuracy
 *
 * This function will open each pair of files in the testdata directory to
 * analyze how accurate the copyright agent is. This function will respond with
 * the number of false negatives, false positives, and correct answers for each
 * file and total tally of these numbers. This will also produce 3 files, one
 * containing all matches that the copyright agent found, all the things that it
 * didn't find, and all of the false positives.
 *
 * Any failure to open a log file or a test file, and any unbalanced <s>...</s>
 * tag in a labeled file, is reported on cerr and ends the program with
 * exit(-1).
 *
 * @param copy the copyright object used for the run; it is initialised here
 *             from the copyright.dic and names.dic dictionaries under
 *             sysconfigdir and destroyed before the function returns
 */
void run_test_files(copyright copy)
{
  /* locals */
  cvector compare;
  copyright_iterator iter;
  cvector_iterator curr;
  FILE* istr, * m_out, * n_out, * p_out;
  char buffer[READMAX + 1];
  char file_name[FILENAME_MAX];
  char copy_buf[FILENAME_MAX];
  char name_buf[FILENAME_MAX];
  char* first, * last, * loc, tmp;
  int i, matches, correct = 0, falsep = 0, falsen = 0;

  /* grab the copyright files */
  memset(copy_buf, '\0', sizeof(copy_buf));
  memset(name_buf, '\0', sizeof(name_buf));
  snprintf(copy_buf, sizeof(copy_buf),
      "%s/mods-enabled/copyright/agent/copyright.dic", sysconfigdir);
  snprintf(name_buf, sizeof(name_buf),
      "%s/mods-enabled/copyright/agent/names.dic", sysconfigdir);

  /* create data structures */
  copyright_init(&copy, copy_buf, name_buf);
  cvector_init(&compare, string_function_registry());

  /* open the logging files */
  m_out = fopen("Matches", "w");
  n_out = fopen("False_Negatives", "w");
  p_out = fopen("False_Positives", "w");

  /* big problem if any of the log files didn't open correctly */
  if(!m_out || !n_out || !p_out)
  {
    fprintf(cerr, "ERROR did not successfully open one of the log files\n");
    fprintf(cerr, "ERROR the files that needed to be opened were:\n");
    fprintf(cerr, "ERROR Matches, False_Positives, False_Negatives\n");
    exit(-1);
  }

  /* loop over every file in the test directory */
  for(i = 0; i < TESTFILE_NUMBER; i++)
  {
    /* bounded write: test_dir comes from outside this function */
    snprintf(file_name, sizeof(file_name), "%s%d_raw", test_dir, i);

    /* attempt to open the labeled test file */
    istr = fopen(file_name, "r");
    if(!istr)
    {
      fprintf(cerr, "ERROR Must run testing from correct directory. The\n");
      fprintf(cerr, "ERROR correct directory is installation dependent but\n");
      fprintf(cerr, "ERROR the working directory should include the folder:\n");
      fprintf(cerr, "ERROR %s\n", test_dir);
      exit(-1);
    }

    /* initialize the buffer and read in any information */
    memset(buffer, '\0', sizeof(buffer));
    buffer[fread(buffer, sizeof(char), READMAX, istr)] = '\0';
    matches = 0;

    /* set everything in the buffer to lower case; tolower() requires a */
    /* value representable as unsigned char, hence the cast             */
    for(first = buffer; *first; first++)
    {
      *first = (char)tolower((unsigned char)*first);
    }

    /* loop through and find all <s>...</s> tags */
    loc = buffer;
    while((first = strstr(loc, "<s>")) != NULL)
    {
      last = strstr(loc, "</s>");

      if(last == NULL)
      {
        fprintf(cerr, "ERROR unmatched \"<s>\"\n");
        fprintf(cerr, "ERROR in file: \"%s\"\n", file_name);
        exit(-1);
      }

      /* a closing tag that precedes the opening tag has no partner */
      if(last <= first)
      {
        fprintf(cerr, "ERROR unmatched \"</s>\"\n");
        fprintf(cerr, "ERROR in file: \"%s\"\n", file_name);
        exit(-1);
      }

      /* temporarily terminate the text at "</s>" so that only the tagged */
      /* text is handed to the vector, then restore the buffer            */
      tmp = *last;
      *last = 0;
      cvector_push_back(compare, first + 3);
      *last = tmp;
      loc = last + 4;
    }

    /* close the previous file and open the corresponding raw data; */
    /* dropping the last 4 characters removes the "_raw" suffix      */
    fclose(istr);
    file_name[strlen(file_name) - 4] = '\0';
    istr = fopen(file_name, "r");
    if(!istr)
    {
      fprintf(cerr, "ERROR Unmatched file in the test directory\n");
      fprintf(cerr, "ERROR File with no match: \"%s\"_raw\n", file_name);
      fprintf(cerr, "ERROR File that caused error: \"%s\"\n", file_name);
      /* nothing can be analyzed without the file, and a NULL stream must */
      /* not reach copyright_analyze() or fclose()                        */
      exit(-1);
    }

    /* perform the analysis on the current file */
    copyright_analyze(copy, istr, REPORTALL);
    fclose(istr);

    /* loop over every match that the copyright object found */
    for(iter = copyright_begin(copy); iter != copyright_end(copy); iter++)
    {
      cvector_iterator best = cvector_begin(compare);
      char score[2048];
      char dst[2048];

      memset(dst, '\0', sizeof(dst));
      memset(score, '\0', sizeof(score));

      /* log the copyright entry */
      fprintf(m_out, "====%s================================\n", file_name);
      fprintf(m_out, "DICT: %s\tNAME: %s\n", copy_entry_dict(*iter), copy_entry_name(*iter));
      fprintf(m_out, "TEXT[%s]\n", copy_entry_text(*iter));

      /* loop over the vector looking for matches, remembering the */
      /* expected entry that produced the longest result so far     */
      for(curr = cvector_begin(compare); curr != cvector_end(compare); curr++)
      {
        if(longest_common(dst, copy_entry_text(*iter), (char*)*curr) > strlen(score))
        {
          strcpy(score, dst);
          best = curr;
        }
      }

      /* log the entry as found if it matched something in compare */
      if(cvector_size(compare) != 0 &&
          (strcmp(copy_entry_dict(*iter), "by") || strlen(score) > THRESHOLD))
      {
        cvector_remove(compare, best);
        matches++;
      }
      /* email and url entries are counted as correct without a label */
      else if(!strcmp(copy_entry_dict(*iter), "email") ||
          !strcmp(copy_entry_dict(*iter), "url"))
      {
        matches++;
      }
      else
      {
        fprintf(p_out, "====%s================================\n", file_name);
        fprintf(p_out, "DICT: %s\tNAME: %s\n", copy_entry_dict(*iter), copy_entry_name(*iter));
        fprintf(p_out, "TEXT[%s]\n", copy_entry_text(*iter));
      }
    }

    /* log all the false negatives: whatever is still left in compare */
    for(curr = cvector_begin(compare); curr != cvector_end(compare); curr++)
    {
      fprintf(n_out, "====%s================================\n", file_name);
      fprintf(n_out, "%s\n", (char*)*curr);
    }

    fprintf(cout, "====%s================================\n", file_name);
    fprintf(cout, "Correct:         %d\n", matches);
    fprintf(cout, "False Positives: %d\n", copyright_size(copy) - matches);
    fprintf(cout, "False Negatives: %d\n", cvector_size(compare));

    /* clean up for the next file */
    correct += matches;
    falsep += copyright_size(copy) - matches;
    falsen += cvector_size(compare);
    cvector_clear(compare);
  }

  fprintf(cout, "==== Totals ================================\n");
  fprintf(cout, "Total Found:     %d\n", correct + falsep);
  fprintf(cout, "Correct:         %d\n", correct);
  fprintf(cout, "False Positives: %d\n", falsep);
  fprintf(cout, "False Negatives: %d\n", falsen);

  fclose(m_out);
  fclose(n_out);
  fclose(p_out);
  copyright_destroy(copy);
  cvector_destroy(compare);
}