Exemplo n.º 1
0
void test_string_function_registry()
{
  function_registry* fr = string_function_registry();
  char* tester = "hello";

  /* start the tests */
  printf("Test string_function_registry: ");
  CU_ASSERT_TRUE(!strcmp(fr->name, "string"));

  /* test the copy function */
  char* cpy = (char*)fr->copy(tester);
  CU_ASSERT_TRUE(!strcmp(cpy, tester));

  /* test to be sure that it is actually a copy */
  tester = "world";
  CU_ASSERT_FALSE(!strcmp(cpy, tester));

  /* free memory */
  fr->destroy(cpy);
  free(fr);

  /* finish the test */
  test_failure();
  printf("\n");
}
Exemplo n.º 2
0
/**
 * @brief main function for the copyright agent
 *
 * The copyright agent is used to automatically locate copyright statements
 * found in code.
 *
 * There are 3 ways to use the copyright agent:
 *   1. Command Line Analysis :: test a file from the command line
 *   2. Agent Based Analysis  :: waits for commands from stdin
 *   3. Accuracy Test         :: tests the accuracy of the copyright agent
 *
 * +-----------------------+
 * | Command Line Analysis |
 * +-----------------------+
 *
 * To analyze a file from the command line:
 *   -C <filename>      :: run copyright agent from command line
 *   -d                 :: turn on debugging information
 *   -T <Copyright Statements | URLs| Emails> :: Copyright Statements | URLs |Emails
 *
 *   example:
 *     $ ./copyright -C myfiletoscan
 *
 * +----------------------+
 * | Agent Based Analysis |
 * +----------------------+
 *
 * To run the copyright agent as an agent simply run with no command line args
 *   -i                 :: initialize a connection to the database
 *   -d                 :: turn on debugging information
 *
 *   example:
 *     $ upload_pk | ./copyright
 *
 * +---------------+
 * | Accuracy Test |
 * +---------------+
 *
 * To test the accuracy of the copyright agent run with a -t. Make sure to run the
 * accuracy tests in the source directory with the testdata directory:
 *   -t                 :: run the accuracy analysis
 *
 *   example:
 *     $ ./copyright -t
 *
 * Running the tests will create 3 files:
 * 1. Matches: contains all of the matches found by the copyright agent, information
 *             includes what file the match was found in, the dictionary element
 *             that it matched, the name that it matched and the text that was found
 * 2. False_Positives: contains all of the false positives found by the agent,
 *             information in the file includes the file the false positive was
 *             in, the dictionary match, the name match, and the text
 * 3. Flase_Negatives: contains all of the false negatives found by the agent,
 *             information in the file includes the file the false negative was
 *             in, and the text of the false negative
 *
 * NOTE: -d will produces the exact same style of Matches file that the accuracy
 *       testing does. Currently this is the only thing that -d will produce
 *
 * @param argc the number of command line arguments
 * @param argv the command line arguments
 * @return 0 on a successful program execution
 */
int main(int argc, char** argv)
{
  /* primitives */
  char sql[512];                // buffer for database access
  int c, i = -1;                // temporary int containers
  int num_files = 0;            // the number of rows in a job
  int ars_pk = 0;               // the args primary key
  int user_pk = 0;
  long upload_pk = 0;           // the upload primary key
  long agent_pk = 0;            // the agents primary key
  char *SVN_REV = NULL;
  char *VERSION = NULL;
  char agent_rev[myBUFSIZ];
  char copy_buf[FILENAME_MAX];
  char name_buf[FILENAME_MAX];
  int report_type = 7;         // defaul as all. binary xxx 1st number as email, 2nd number as url, 3rd number as statement 
  int cli_run = 0;               // when run from command line, that mean -C option is set; 1: yes, 0: no

  /* Database structs */
  PGconn* pgConn = NULL;        // the connection to Database
  PGresult* pgResult = NULL;    // result of a database access

  /* copyright structs */
  copyright copy;               // the work horse of the copyright agent
  pair curr;                    // pair to push into the file list

  /* verbose data */
  FILE* mout = NULL;

  /* set the output streams */
  cout = stdout;
  cerr = stdout;
  cin = stdin;

  /* connect to the scheduler */
  fo_scheduler_connect(&argc, argv, &pgConn);

  /* initialize complex data strcutres */
  memset(copy_buf, '\0', sizeof(copy_buf));
  memset(name_buf, '\0', sizeof(copy_buf));
  snprintf(copy_buf, sizeof(copy_buf),
      "%s/mods-enabled/copyright/agent/copyright.dic",
      sysconfigdir);
  snprintf(name_buf, sizeof(name_buf),
      "%s/mods-enabled/copyright/agent/names.dic",
      sysconfigdir);

  if(!copyright_init(&copy, copy_buf, name_buf))
  {
    fprintf(cerr, "FATAL %s.%d: copyright initialization failed\n", __FILE__, __LINE__);
    fprintf(cerr, "FATAL %s\n", strerror(errno));
    fflush(cerr);
    return 1;
  }

  /* parse the command line options */
  while((c = getopt(argc, argv, "T:dc:C:tiVvh")) != -1)
  {
    switch(c)
    {
      case 'v': /* debugging */
        mout = fopen("Matches", "w");
        if(!mout)
        {
          fprintf(cerr, "ERROR could not open Matches for logging\n");
          fflush(cerr);
        }
        else
        {
          verbose = 1;
        }
        break;
      case 'C': /* run from command line */
        cli_run = 1;
        pair_init(&curr, string_function_registry(), int_function_registry());

        pair_set_first(curr, optarg);
        pair_set_second(curr, &i);
        num_files++;

        break;
      case 'T': /* report type, Copyright Statements | URLs| Emails */
        report_type = atoi(optarg);
        printf("report_type is:%d\n", report_type);
        break;
      case 't': /* run accuracy testing */
        run_test_files(copy);
        copyright_destroy(copy);
        return 0;
      case 'i': /* initialize database connections */
        copyright_destroy(copy);
        PQfinish(pgConn);
        return 0;
      case 'V':
        printf("%s", BuildVersion);
        copyright_destroy(copy);
        PQfinish(pgConn);
	return 0;
      default: /* error, print usage */
        copyright_usage(argv[0]);
        return 3;
    }
  }

  /** run from command line */
  if (1 == cli_run) {
    perform_analysis(pgConn, copy, curr, agent_pk, mout, report_type);
    pair_destroy(curr);
  }

  /* if there are no files in the file list then the agent is begin run from */
  /* the scheduler, open the database and grab the files to be analyzed      */
  if(num_files == 0)
  {
    /* create the sql copy structure */
    sqlcpy = fo_sqlCopyCreate(pgConn, "copyright", 32768, 7,
        "agent_fk", "pfile_fk", "copy_startbyte", "copy_endbyte", "content", "hash", "type");

    /* book keeping */
    pair_init(&curr, string_function_registry(), int_function_registry());
    db_connected = 1;
    SVN_REV = fo_sysconfig("copyright", "SVN_REV");
    VERSION = fo_sysconfig("copyright", "VERSION");
    sprintf(agent_rev, "%s.%s", VERSION, SVN_REV);
    agent_pk = fo_GetAgentKey(pgConn, AGENT_NAME, 0, agent_rev, AGENT_DESC);

    /* make sure that we are connected to the database */
    if(!check_copyright_table(pgConn))
    {
      return 5;
    }

    user_pk = fo_scheduler_userID(); /* get user_pk for user who queued the agent */

    /* enter the main agent loop */
    while(fo_scheduler_next())
    {
      upload_pk = atol(fo_scheduler_current());

      /* Check Permissions */
      if (GetUploadPerm(pgConn, upload_pk, user_pk) < PERM_WRITE)
      {
        LOG_ERROR("You have no update permissions on upload %ld", upload_pk);
        continue;
      }
                                               
      ars_pk = fo_WriteARS(pgConn, 0, upload_pk, agent_pk, AGENT_ARS, NULL, 0);

      sprintf(sql, fetch_pfile, upload_pk, agent_pk, agent_pk);
      pgResult = PQexec(pgConn, sql);
      num_files = PQntuples(pgResult);

      for(i = 0; i < num_files; i++)
      {
        c = atoi(PQgetvalue(pgResult, i, PQfnumber(pgResult, "pfile_pk")));
        pair_set_first(curr, PQgetvalue(pgResult, i, PQfnumber(pgResult, "pfilename")));
        pair_set_second(curr, &c);
        perform_analysis(pgConn, copy, curr, agent_pk, mout, REPORTALL);
      }

      fo_WriteARS(pgConn, ars_pk, upload_pk, agent_pk, AGENT_ARS, NULL, 1);
      PQclear(pgResult);
    }

    pair_destroy(curr);
  }

  if(db_connected)
  {
    fo_sqlCopyDestroy(sqlcpy, 1);
    PQfinish(pgConn);
  }

  if(verbose)
  {
    fclose(mout);
  }

  copyright_destroy(copy);
  fo_scheduler_disconnect(0);

  return 0;
}
Exemplo n.º 3
0
/**
 * @brief runs the labeled test files to determine accuracy
 *
 * This function will open each pair of files in the testdata directory to
 * analyze how accurate the copyright agent is. This function will respond with
 * the number of false negatives, false positives, and correct answers for each
 * file and total tally of these numbers. This will also produce 3 files, one
 * containing all matches that the copyright agent found, all the things that it
 * didn't find, and all of the false positives.
 */
void run_test_files(copyright copy)
{
  /* locals */
  cvector compare;
  copyright_iterator iter;
  cvector_iterator curr;
  FILE* istr, * m_out, * n_out, * p_out;
  char buffer[READMAX + 1];
  char file_name[FILENAME_MAX];
  char copy_buf[FILENAME_MAX];
  char name_buf[FILENAME_MAX];
  char* first, * last, * loc, tmp;
  int i, matches, correct = 0, falsep = 0, falsen = 0;

  /* grab the copyright files */
  memset(copy_buf, '\0', sizeof(copy_buf));
  memset(name_buf, '\0', sizeof(copy_buf));
  snprintf(copy_buf, sizeof(copy_buf),
      "%s/mods-enabled/copyright/agent/copyright.dic",
      sysconfigdir);
  snprintf(name_buf, sizeof(name_buf),
      "%s/mods-enabled/copyright/agent/names.dic",
      sysconfigdir);

  /* create data structures */
  copyright_init(&copy, copy_buf, name_buf);
  cvector_init(&compare, string_function_registry());

  /* open the logging files */
  m_out = fopen("Matches", "w");
  n_out = fopen("False_Negatives", "w");
  p_out = fopen("False_Positives", "w");

  /* big problem if any of the log files didn't open correctly */
  if(!m_out || !n_out || !p_out)
  {
    fprintf(cerr, "ERROR did not successfully open one of the log files\n");
    fprintf(cerr, "ERROR the files that needed to be opened were:\n");
    fprintf(cerr, "ERROR Matches, False_Positives, False_Negatives\n");
    exit(-1);
  }

  /* loop over every file in the test directory */
  for(i = 0; i < TESTFILE_NUMBER; i++)
  {
    sprintf(file_name, "%s%d_raw", test_dir, i);

    /* attempt to open the labeled test file */
    istr = fopen(file_name, "r");
    if(!istr)
    {
      fprintf(cerr, "ERROR Must run testing from correct directory. The\n");
      fprintf(cerr, "ERROR correct directory is installation dependent but\n");
      fprintf(cerr, "ERROR the working directory should include the folder:\n");
      fprintf(cerr, "ERROR   %s\n", test_dir);
      exit(-1);
    }

    /* initialize the buffer and read in any information */
    memset(buffer, '\0', sizeof(buffer));
    buffer[fread(buffer, sizeof(char), READMAX, istr)] = '\0';
    matches = 0;

    /* set everything in the buffer to lower case */
    for(first = buffer; *first; first++)
    {
      *first = tolower(*first);
    }

    /* loop through and find all <s>...</s> tags */
    loc = buffer;
    while((first = strstr(loc, "<s>")) != NULL)
    {
      last = strstr(loc, "</s>");

      if(last == NULL)
      {
        fprintf(cerr, "ERROR unmatched \"<s>\"\n");
        fprintf(cerr, "ERROR in file: \"%s\"\n", file_name);
        exit(-1);
      }

      if(last <= first)
      {
        fprintf(cerr, "ERROR unmatched \"</s>\"\n");
        fprintf(cerr, "ERROR in file: \"%s\"\n", file_name);
        exit(-1);
      }

      tmp = *last;
      *last = 0;
      cvector_push_back(compare, first + 3);
      *last = tmp;
      loc = last + 4;
    }

    /* close the previous file and open the corresponding raw data */
    fclose(istr);
    file_name[strlen(file_name) - 4] = '\0';
    istr = fopen(file_name, "r");
    if(!istr)
    {
      fprintf(cerr, "ERROR Unmatched file in the test directory");
      fprintf(cerr, "ERROR File with no match: \"%s\"_raw\n", file_name);
      fprintf(cerr, "ERROR File that caused error: \"%s\"\n", file_name);
    }

    /* perform the analysis on the current file */
    copyright_analyze(copy, istr, REPORTALL);
    fclose(istr);

    /* loop over every match that the copyright object found */
    for(iter = copyright_begin(copy); iter != copyright_end(copy); iter++)
    {
      cvector_iterator best = cvector_begin(compare);
      char score[2048];
      char dst[2048];

      memset(dst, '\0', sizeof(dst));
      memset(score, '\0', sizeof(score));

      /* log the coyright entry */
      fprintf(m_out, "====%s================================\n", file_name);
      fprintf(m_out, "DICT: %s\tNAME: %s\n",copy_entry_dict(*iter), copy_entry_name(*iter));
      fprintf(m_out, "TEXT[%s]\n",copy_entry_text(*iter));

      /* loop over the vector looking for matches */
      for(curr = cvector_begin(compare); curr != cvector_end(compare); curr++)
      {
        if(longest_common(dst, copy_entry_text(*iter), (char*)*curr) > strlen(score))
        {
          strcpy(score, dst);
          best = curr;
        }
      }

      /* log the entry as found if it matched something in compare */
      if(cvector_size(compare) != 0 &&
          (strcmp(copy_entry_dict(*iter), "by") || strlen(score) > THRESHOLD))
      {
        cvector_remove(compare, best);
        matches++;
      }
      else if(!strcmp(copy_entry_dict(*iter), "email") || !strcmp(copy_entry_dict(*iter), "url"))
      {
        matches++;
      }
      else
      {
        fprintf(p_out, "====%s================================\n", file_name);
        fprintf(p_out, "DICT: %s\tNAME: %s\n",copy_entry_dict(*iter), copy_entry_name(*iter));
        fprintf(p_out, "TEXT[%s]\n",copy_entry_text(*iter));
      }
    }

    /* log all the false negatives */
    for(curr = cvector_begin(compare); curr != cvector_end(compare); curr++)
    {
      fprintf(n_out, "====%s================================\n", file_name);
      fprintf(n_out, "%s\n", (char*)*curr);
    }

    fprintf(cout, "====%s================================\n", file_name);
    fprintf(cout, "Correct:         %d\n", matches);
    fprintf(cout, "False Positives: %d\n", copyright_size(copy) - matches);
    fprintf(cout, "False Negatives: %d\n", cvector_size(compare));

    /* clean up for the next file */
    correct += matches;
    falsep += copyright_size(copy) - matches;
    falsen += cvector_size(compare);
    cvector_clear(compare);
  }

  fprintf(cout, "==== Totals ================================\n");
  fprintf(cout, "Total Found:     %d\n", correct + falsep);
  fprintf(cout, "Correct:         %d\n", correct);
  fprintf(cout, "False Positives: %d\n", falsep);
  fprintf(cout, "False Negatives: %d\n", falsen);

  fclose(m_out);
  fclose(n_out);
  fclose(p_out);
  copyright_destroy(copy);
  cvector_destroy(compare);
}