/**
 * @brief unit test for the string function registry
 *
 * Verifies the name stored in the registry and that the registry's copy
 * function returns a string with the same content as its argument but held at
 * a different address. The copied string is released through the registry's
 * own destroy function and the registry itself is released with free().
 */
void test_string_function_registry()
{
  function_registry* fr = string_function_registry();
  char* tester = "hello";

  /* start the tests */
  printf("Test string_function_registry: ");
  CU_ASSERT_TRUE(!strcmp(fr->name, "string"));

  /* test the copy function: the content must match the source */
  char* cpy = (char*)fr->copy(tester);
  CU_ASSERT_TRUE(!strcmp(cpy, tester));

  /* test to be sure that it is actually a copy: the result must not alias  */
  /* the source string. Re-pointing tester alone cannot detect aliasing, so */
  /* the addresses are compared before tester is changed                    */
  CU_ASSERT_TRUE(cpy != tester);
  tester = "world";
  CU_ASSERT_FALSE(!strcmp(cpy, tester));

  /* free memory */
  fr->destroy(cpy);
  free(fr);

  /* finish the test */
  test_failure();
  printf("\n");
}
/** * @brief main function for the copyright agent * * The copyright agent is used to automatically locate copyright statements * found in code. * * There are 3 ways to use the copyright agent: * 1. Command Line Analysis :: test a file from the command line * 2. Agent Based Analysis :: waits for commands from stdin * 3. Accuracy Test :: tests the accuracy of the copyright agent * * +-----------------------+ * | Command Line Analysis | * +-----------------------+ * * To analyze a file from the command line: * -C <filename> :: run copyright agent from command line * -d :: turn on debugging information * -T <Copyright Statements | URLs| Emails> :: Copyright Statements | URLs |Emails * * example: * $ ./copyright -C myfiletoscan * * +----------------------+ * | Agent Based Analysis | * +----------------------+ * * To run the copyright agent as an agent simply run with no command line args * -i :: initialize a connection to the database * -d :: turn on debugging information * * example: * $ upload_pk | ./copyright * * +---------------+ * | Accuracy Test | * +---------------+ * * To test the accuracy of the copyright agent run with a -t. Make sure to run the * accuracy tests in the source directory with the testdata directory: * -t :: run the accuracy analysis * * example: * $ ./copyright -t * * Running the tests will create 3 files: * 1. Matches: contains all of the matches found by the copyright agent, information * includes what file the match was found in, the dictionary element * that it matched, the name that it matched and the text that was found * 2. False_Positives: contains all of the false positives found by the agent, * information in the file includes the file the false positive was * in, the dictionary match, the name match, and the text * 3. 
Flase_Negatives: contains all of the false negatives found by the agent, * information in the file includes the file the false negative was * in, and the text of the false negative * * NOTE: -d will produces the exact same style of Matches file that the accuracy * testing does. Currently this is the only thing that -d will produce * * @param argc the number of command line arguments * @param argv the command line arguments * @return 0 on a successful program execution */ int main(int argc, char** argv) { /* primitives */ char sql[512]; // buffer for database access int c, i = -1; // temporary int containers int num_files = 0; // the number of rows in a job int ars_pk = 0; // the args primary key int user_pk = 0; long upload_pk = 0; // the upload primary key long agent_pk = 0; // the agents primary key char *SVN_REV = NULL; char *VERSION = NULL; char agent_rev[myBUFSIZ]; char copy_buf[FILENAME_MAX]; char name_buf[FILENAME_MAX]; int report_type = 7; // defaul as all. binary xxx 1st number as email, 2nd number as url, 3rd number as statement int cli_run = 0; // when run from command line, that mean -C option is set; 1: yes, 0: no /* Database structs */ PGconn* pgConn = NULL; // the connection to Database PGresult* pgResult = NULL; // result of a database access /* copyright structs */ copyright copy; // the work horse of the copyright agent pair curr; // pair to push into the file list /* verbose data */ FILE* mout = NULL; /* set the output streams */ cout = stdout; cerr = stdout; cin = stdin; /* connect to the scheduler */ fo_scheduler_connect(&argc, argv, &pgConn); /* initialize complex data strcutres */ memset(copy_buf, '\0', sizeof(copy_buf)); memset(name_buf, '\0', sizeof(copy_buf)); snprintf(copy_buf, sizeof(copy_buf), "%s/mods-enabled/copyright/agent/copyright.dic", sysconfigdir); snprintf(name_buf, sizeof(name_buf), "%s/mods-enabled/copyright/agent/names.dic", sysconfigdir); if(!copyright_init(©, copy_buf, name_buf)) { fprintf(cerr, "FATAL %s.%d: copyright 
initialization failed\n", __FILE__, __LINE__); fprintf(cerr, "FATAL %s\n", strerror(errno)); fflush(cerr); return 1; } /* parse the command line options */ while((c = getopt(argc, argv, "T:dc:C:tiVvh")) != -1) { switch(c) { case 'v': /* debugging */ mout = fopen("Matches", "w"); if(!mout) { fprintf(cerr, "ERROR could not open Matches for logging\n"); fflush(cerr); } else { verbose = 1; } break; case 'C': /* run from command line */ cli_run = 1; pair_init(&curr, string_function_registry(), int_function_registry()); pair_set_first(curr, optarg); pair_set_second(curr, &i); num_files++; break; case 'T': /* report type, Copyright Statements | URLs| Emails */ report_type = atoi(optarg); printf("report_type is:%d\n", report_type); break; case 't': /* run accuracy testing */ run_test_files(copy); copyright_destroy(copy); return 0; case 'i': /* initialize database connections */ copyright_destroy(copy); PQfinish(pgConn); return 0; case 'V': printf("%s", BuildVersion); copyright_destroy(copy); PQfinish(pgConn); return 0; default: /* error, print usage */ copyright_usage(argv[0]); return 3; } } /** run from command line */ if (1 == cli_run) { perform_analysis(pgConn, copy, curr, agent_pk, mout, report_type); pair_destroy(curr); } /* if there are no files in the file list then the agent is begin run from */ /* the scheduler, open the database and grab the files to be analyzed */ if(num_files == 0) { /* create the sql copy structure */ sqlcpy = fo_sqlCopyCreate(pgConn, "copyright", 32768, 7, "agent_fk", "pfile_fk", "copy_startbyte", "copy_endbyte", "content", "hash", "type"); /* book keeping */ pair_init(&curr, string_function_registry(), int_function_registry()); db_connected = 1; SVN_REV = fo_sysconfig("copyright", "SVN_REV"); VERSION = fo_sysconfig("copyright", "VERSION"); sprintf(agent_rev, "%s.%s", VERSION, SVN_REV); agent_pk = fo_GetAgentKey(pgConn, AGENT_NAME, 0, agent_rev, AGENT_DESC); /* make sure that we are connected to the database */ 
if(!check_copyright_table(pgConn)) { return 5; } user_pk = fo_scheduler_userID(); /* get user_pk for user who queued the agent */ /* enter the main agent loop */ while(fo_scheduler_next()) { upload_pk = atol(fo_scheduler_current()); /* Check Permissions */ if (GetUploadPerm(pgConn, upload_pk, user_pk) < PERM_WRITE) { LOG_ERROR("You have no update permissions on upload %ld", upload_pk); continue; } ars_pk = fo_WriteARS(pgConn, 0, upload_pk, agent_pk, AGENT_ARS, NULL, 0); sprintf(sql, fetch_pfile, upload_pk, agent_pk, agent_pk); pgResult = PQexec(pgConn, sql); num_files = PQntuples(pgResult); for(i = 0; i < num_files; i++) { c = atoi(PQgetvalue(pgResult, i, PQfnumber(pgResult, "pfile_pk"))); pair_set_first(curr, PQgetvalue(pgResult, i, PQfnumber(pgResult, "pfilename"))); pair_set_second(curr, &c); perform_analysis(pgConn, copy, curr, agent_pk, mout, REPORTALL); } fo_WriteARS(pgConn, ars_pk, upload_pk, agent_pk, AGENT_ARS, NULL, 1); PQclear(pgResult); } pair_destroy(curr); } if(db_connected) { fo_sqlCopyDestroy(sqlcpy, 1); PQfinish(pgConn); } if(verbose) { fclose(mout); } copyright_destroy(copy); fo_scheduler_disconnect(0); return 0; }
/**
 * @brief runs the labeled test files to determine accuracy
 *
 * This function will open each pair of files in the testdata directory to
 * analyze how accurate the copyright agent is. This function will respond with
 * the number of false negatives, false positives, and correct answers for each
 * file and total tally of these numbers. This will also produce 3 files, one
 * containing all matches that the copyright agent found, all the things that it
 * didn't find, and all of the false positives.
 *
 * For every index below TESTFILE_NUMBER two files are read: "<test_dir><i>_raw",
 * whose <s>...</s> spans are collected as the expected results, and
 * "<test_dir><i>", which is handed to copyright_analyze().
 *
 * Any failure to initialize the copyright object, open a log file, open a test
 * file, or parse the <s> tags terminates the program with exit(-1).
 *
 * @param copy the copyright instance; its address is passed to copyright_init()
 *             before use and it is given to copyright_destroy() before
 *             returning
 */
void run_test_files(copyright copy)
{
  /* locals */
  cvector compare;
  copyright_iterator iter;
  cvector_iterator curr;
  FILE* istr, * m_out, * n_out, * p_out;
  char buffer[READMAX + 1];
  char file_name[FILENAME_MAX];
  char copy_buf[FILENAME_MAX];
  char name_buf[FILENAME_MAX];
  char* first, * last, * loc, tmp;
  int i, matches, correct = 0, falsep = 0, falsen = 0;

  /* grab the copyright files */
  memset(copy_buf, '\0', sizeof(copy_buf));
  memset(name_buf, '\0', sizeof(name_buf));
  snprintf(copy_buf, sizeof(copy_buf),
      "%s/mods-enabled/copyright/agent/copyright.dic", sysconfigdir);
  snprintf(name_buf, sizeof(name_buf),
      "%s/mods-enabled/copyright/agent/names.dic", sysconfigdir);

  /* create data structures; a zero result from copyright_init is a failure */
  if(!copyright_init(&copy, copy_buf, name_buf))
  {
    fprintf(cerr, "FATAL %s.%d: copyright initialization failed\n", __FILE__, __LINE__);
    exit(-1);
  }
  cvector_init(&compare, string_function_registry());

  /* open the logging files */
  m_out = fopen("Matches", "w");
  n_out = fopen("False_Negatives", "w");
  p_out = fopen("False_Positives", "w");

  /* big problem if any of the log files didn't open correctly */
  if(!m_out || !n_out || !p_out)
  {
    fprintf(cerr, "ERROR did not successfully open one of the log files\n");
    fprintf(cerr, "ERROR the files that needed to be opened were:\n");
    fprintf(cerr, "ERROR Matches, False_Positives, False_Negatives\n");
    exit(-1);
  }

  /* loop over every file in the test directory */
  for(i = 0; i < TESTFILE_NUMBER; i++)
  {
    snprintf(file_name, sizeof(file_name), "%s%d_raw", test_dir, i);

    /* attempt to open the labeled test file */
    istr = fopen(file_name, "r");
    if(!istr)
    {
      fprintf(cerr, "ERROR Must run testing from correct directory. The\n");
      fprintf(cerr, "ERROR correct directory is installation dependent but\n");
      fprintf(cerr, "ERROR the working directory should include the folder:\n");
      fprintf(cerr, "ERROR   %s\n", test_dir);
      exit(-1);
    }

    /* initialize the buffer and read in any information */
    memset(buffer, '\0', sizeof(buffer));
    buffer[fread(buffer, sizeof(char), READMAX, istr)] = '\0';
    matches = 0;

    /* set everything in the buffer to lower case; the cast keeps the   */
    /* argument of tolower in the unsigned char range that it requires */
    for(first = buffer; *first; first++)
    {
      *first = (char)tolower((unsigned char)*first);
    }

    /* loop through and find all <s>...</s> tags */
    loc = buffer;
    while((first = strstr(loc, "<s>")) != NULL)
    {
      last = strstr(loc, "</s>");

      if(last == NULL)
      {
        fprintf(cerr, "ERROR unmatched \"<s>\"\n");
        fprintf(cerr, "ERROR in file: \"%s\"\n", file_name);
        exit(-1);
      }

      if(last <= first)
      {
        fprintf(cerr, "ERROR unmatched \"</s>\"\n");
        fprintf(cerr, "ERROR in file: \"%s\"\n", file_name);
        exit(-1);
      }

      /* temporarily terminate the buffer at the closing tag so that only */
      /* the tagged text is pushed, then restore the byte                 */
      tmp = *last;
      *last = 0;
      cvector_push_back(compare, first + 3);
      *last = tmp;
      loc = last + 4;
    }

    /* close the previous file and open the corresponding raw data; the */
    /* name is shortened by the 4 characters of the "_raw" suffix       */
    fclose(istr);
    file_name[strlen(file_name) - 4] = '\0';
    istr = fopen(file_name, "r");
    if(!istr)
    {
      fprintf(cerr, "ERROR Unmatched file in the test directory\n");
      fprintf(cerr, "ERROR File with no match: \"%s\"_raw\n", file_name);
      fprintf(cerr, "ERROR File that caused error: \"%s\"\n", file_name);
      /* nothing can be analyzed without the file; continuing would hand */
      /* a NULL stream to copyright_analyze and fclose                   */
      exit(-1);
    }

    /* perform the analysis on the current file */
    copyright_analyze(copy, istr, REPORTALL);
    fclose(istr);

    /* loop over every match that the copyright object found */
    for(iter = copyright_begin(copy); iter != copyright_end(copy); iter++)
    {
      cvector_iterator best = cvector_begin(compare);
      char score[2048];
      char dst[2048];

      memset(dst, '\0', sizeof(dst));
      memset(score, '\0', sizeof(score));

      /* log the copyright entry */
      fprintf(m_out, "====%s================================\n", file_name);
      fprintf(m_out, "DICT: %s\tNAME: %s\n",copy_entry_dict(*iter), copy_entry_name(*iter));
      fprintf(m_out, "TEXT[%s]\n",copy_entry_text(*iter));

      /* loop over the vector looking for matches, keeping the expected */
      /* entry with the longest common text seen so far                 */
      for(curr = cvector_begin(compare); curr != cvector_end(compare); curr++)
      {
        if(longest_common(dst, copy_entry_text(*iter), (char*)*curr) > strlen(score))
        {
          strcpy(score, dst);
          best = curr;
        }
      }

      /* log the entry as found if it matched something in compare */
      if(cvector_size(compare) != 0 &&
          (strcmp(copy_entry_dict(*iter), "by") || strlen(score) > THRESHOLD))
      {
        cvector_remove(compare, best);
        matches++;
      }
      else if(!strcmp(copy_entry_dict(*iter), "email") || !strcmp(copy_entry_dict(*iter), "url"))
      {
        matches++;
      }
      else
      {
        fprintf(p_out, "====%s================================\n", file_name);
        fprintf(p_out, "DICT: %s\tNAME: %s\n",copy_entry_dict(*iter), copy_entry_name(*iter));
        fprintf(p_out, "TEXT[%s]\n",copy_entry_text(*iter));
      }
    }

    /* log all the false negatives: whatever is still left in compare */
    for(curr = cvector_begin(compare); curr != cvector_end(compare); curr++)
    {
      fprintf(n_out, "====%s================================\n", file_name);
      fprintf(n_out, "%s\n", (char*)*curr);
    }

    fprintf(cout, "====%s================================\n", file_name);
    fprintf(cout, "Correct:         %d\n", matches);
    fprintf(cout, "False Positives: %d\n", copyright_size(copy) - matches);
    fprintf(cout, "False Negatives: %d\n", cvector_size(compare));

    /* clean up for the next file */
    correct += matches;
    falsep += copyright_size(copy) - matches;
    falsen += cvector_size(compare);
    cvector_clear(compare);
  }

  fprintf(cout, "==== Totals ================================\n");
  fprintf(cout, "Total Found:     %d\n", correct + falsep);
  fprintf(cout, "Correct:         %d\n", correct);
  fprintf(cout, "False Positives: %d\n", falsep);
  fprintf(cout, "False Negatives: %d\n", falsen);

  fclose(m_out);
  fclose(n_out);
  fclose(p_out);
  copyright_destroy(copy);
  cvector_destroy(compare);
}