Example #1
 * @brief main function for the copyright agent
 * The copyright agent is used to automatically locate copyright statements
 * found in code.
 * There are 3 ways to use the copyright agent:
 *   1. Command Line Analysis :: test a file from the command line
 *   2. Agent Based Analysis  :: waits for commands from stdin
 *   3. Accuracy Test         :: tests the accuracy of the copyright agent
 * +-----------------------+
 * | Command Line Analysis |
 * +-----------------------+
 * To analyze a file from the command line:
 *   -C <filename>      :: run copyright agent from command line
 *   -d                 :: turn on debugging information
 *   -T <Copyright Statements | URLs | Emails> :: report type: Copyright Statements | URLs | Emails
 *   example:
 *     $ ./copyright -C myfiletoscan
 * +----------------------+
 * | Agent Based Analysis |
 * +----------------------+
 * To run the copyright agent as an agent simply run with no command line args
 *   -i                 :: initialize a connection to the database
 *   -d                 :: turn on debugging information
 *   example:
 *     $ upload_pk | ./copyright
 * +---------------+
 * | Accuracy Test |
 * +---------------+
 * To test the accuracy of the copyright agent run with a -t. Make sure to run the
 * accuracy tests in the source directory with the testdata directory:
 *   -t                 :: run the accuracy analysis
 *   example:
 *     $ ./copyright -t
 * Running the tests will create 3 files:
 * 1. Matches: contains all of the matches found by the copyright agent, information
 *             includes what file the match was found in, the dictionary element
 *             that it matched, the name that it matched and the text that was found
 * 2. False_Positives: contains all of the false positives found by the agent,
 *             information in the file includes the file the false positive was
 *             in, the dictionary match, the name match, and the text
 * 3. False_Negatives: contains all of the false negatives found by the agent,
 *             information in the file includes the file the false negative was
 *             in, and the text of the false negative
 * NOTE: -d will produce the exact same style of Matches file that the accuracy
 *       testing does. Currently this is the only thing that -d will produce
 * @param argc the number of command line arguments
 * @param argv the command line arguments
 * @return 0 on a successful program execution
int main(int argc, char** argv)
  /*
   * NOTE(review): this excerpt has lost its braces, the switch/loop headers
   * and a few statements (see the truncated snprintf calls below), so the
   * comments here describe only what is still visible.
   *
   * Visible flow: connect to the scheduler, initialize the copyright
   * structure, parse the command line, then either analyze the single file
   * given with -C or pull the files of the current upload from the database
   * and analyze each of them.
   */

  /* primitives */
  char sql[512];                // buffer for database access
  int c, i = -1;                // temporary int containers (c: getopt result / pfile_pk, i: loop index)
  int num_files = 0;            // the number of rows in a job
  int ars_pk = 0;               // the args primary key
  int user_pk = 0;              // id of the user who queued the agent (from fo_scheduler_userID)
  long upload_pk = 0;           // the upload primary key
  long agent_pk = 0;            // the agents primary key
  char *SVN_REV = NULL;         // "SVN_REV" value read via fo_sysconfig
  char *VERSION = NULL;         // "VERSION" value read via fo_sysconfig
  char agent_rev[myBUFSIZ];     // "<VERSION>.<SVN_REV>" string passed to fo_GetAgentKey
  char copy_buf[FILENAME_MAX];  // first string argument handed to copyright_init
  char name_buf[FILENAME_MAX];  // second string argument handed to copyright_init
  int report_type = 7;         // default is all. binary xxx: 1st digit is email, 2nd digit is url, 3rd digit is statement
  int cli_run = 0;               // 1 when run from the command line (i.e. the -C option is set), 0 otherwise

  /* Database structs */
  PGconn* pgConn = NULL;        // the connection to Database
  PGresult* pgResult = NULL;    // result of a database access

  /* copyright structs */
  copyright copy;               // the work horse of the copyright agent
  pair curr;                    // pair to push into the file list

  /* verbose data */
  FILE* mout = NULL;            // "Matches" log file, opened only by the -v option

  /* set the output streams */
  /* NOTE(review): cerr is pointed at stdout, not stderr -- confirm this is intentional */
  cout = stdout;
  cerr = stdout;
  cin = stdin;

  /* connect to the scheduler */
  fo_scheduler_connect(&argc, argv, &pgConn);

  /* initialize complex data structures */
  memset(copy_buf, '\0', sizeof(copy_buf));
  /* NOTE(review): clears name_buf using sizeof(copy_buf); both arrays are
   * FILENAME_MAX so the sizes agree, but sizeof(name_buf) would be the
   * consistent choice. */
  memset(name_buf, '\0', sizeof(copy_buf));
  /* NOTE(review): the argument lists of the two snprintf calls below are
   * truncated in this excerpt. */
  snprintf(copy_buf, sizeof(copy_buf),
  snprintf(name_buf, sizeof(name_buf),

  /* abort with exit code 1 when the copyright structure cannot be set up */
  if(!copyright_init(&copy, copy_buf, name_buf))
    fprintf(cerr, "FATAL %s.%d: copyright initialization failed\n", __FILE__, __LINE__);
    fprintf(cerr, "FATAL %s\n", strerror(errno));
    return 1;

  /* parse the command line options */
  while((c = getopt(argc, argv, "T:dc:C:tiVvh")) != -1)
      case 'v': /* debugging */
        /* open the "Matches" log that perform_analysis receives as mout */
        mout = fopen("Matches", "w");
          fprintf(cerr, "ERROR could not open Matches for logging\n");
          verbose = 1;
      case 'C': /* run from command line */
        cli_run = 1;
        pair_init(&curr, string_function_registry(), int_function_registry());

        /* the pair holds (file name from -C, i); in the visible code i still has its initial value of -1 here */
        pair_set_first(curr, optarg);
        pair_set_second(curr, &i);

      case 'T': /* report type, Copyright Statements | URLs| Emails */
        /* NOTE(review): atoi performs no validation of the -T argument */
        report_type = atoi(optarg);
        printf("report_type is:%d\n", report_type);
      case 't': /* run accuracy testing */
        return 0;
      case 'i': /* initialize database connections */
        return 0;
      case 'V': /* print the build version and exit */
        printf("%s", BuildVersion);
	return 0;
      default: /* error, print usage */
        return 3;

  /** run from command line */
  if (1 == cli_run) {
    perform_analysis(pgConn, copy, curr, agent_pk, mout, report_type);

  /* if there are no files in the file list then the agent is being run from */
  /* the scheduler, open the database and grab the files to be analyzed      */
  if(num_files == 0)
    /* create the sql copy structure */
    sqlcpy = fo_sqlCopyCreate(pgConn, "copyright", 32768, 7,
        "agent_fk", "pfile_fk", "copy_startbyte", "copy_endbyte", "content", "hash", "type");

    /* book keeping */
    pair_init(&curr, string_function_registry(), int_function_registry());
    db_connected = 1;
    SVN_REV = fo_sysconfig("copyright", "SVN_REV");
    VERSION = fo_sysconfig("copyright", "VERSION");
    /* NOTE(review): unbounded sprintf into agent_rev[myBUFSIZ]; VERSION and
     * SVN_REV are not checked for NULL in the visible code. */
    sprintf(agent_rev, "%s.%s", VERSION, SVN_REV);
    agent_pk = fo_GetAgentKey(pgConn, AGENT_NAME, 0, agent_rev, AGENT_DESC);

    /* make sure that we are connected to the database */
      return 5;

    user_pk = fo_scheduler_userID(); /* get user_pk for user who queued the agent */

    /* enter the main agent loop */
      /* NOTE(review): the return value of fo_scheduler_current() is passed
       * straight to atol without a NULL check in the visible code. */
      upload_pk = atol(fo_scheduler_current());

      /* Check Permissions */
      if (GetUploadPerm(pgConn, upload_pk, user_pk) < PERM_WRITE)
        LOG_ERROR("You have no update permissions on upload %ld", upload_pk);
      /* the key returned here is handed back to fo_WriteARS once the upload has been processed */
      ars_pk = fo_WriteARS(pgConn, 0, upload_pk, agent_pk, AGENT_ARS, NULL, 0);

      /* NOTE(review): unbounded sprintf into sql[512]; the result of PQexec
       * is not checked before PQntuples in the visible code. */
      sprintf(sql, fetch_pfile, upload_pk, agent_pk, agent_pk);
      pgResult = PQexec(pgConn, sql);
      num_files = PQntuples(pgResult);

      /* analyze every row: the pair holds (pfilename, pfile_pk) */
      for(i = 0; i < num_files; i++)
        c = atoi(PQgetvalue(pgResult, i, PQfnumber(pgResult, "pfile_pk")));
        pair_set_first(curr, PQgetvalue(pgResult, i, PQfnumber(pgResult, "pfilename")));
        pair_set_second(curr, &c);
        perform_analysis(pgConn, copy, curr, agent_pk, mout, REPORTALL);

      /* second fo_WriteARS call for this upload, now passing ars_pk and a final argument of 1 */
      fo_WriteARS(pgConn, ars_pk, upload_pk, agent_pk, AGENT_ARS, NULL, 1);


    fo_sqlCopyDestroy(sqlcpy, 1);



  return 0;
Example #2
/**
 * Entry point: parses the configuration and command line, picks an output
 * formatter and runs perform_analysis on every input file.
 *
 * NOTE(review): this excerpt has lost its braces and some statements (the
 * plugin load/unload calls and the working-directory change are not
 * visible), so the comments describe only what is still shown.
 *
 * @param argc the number of command line arguments
 * @param argv the command line arguments; argv[0] is used to locate manalyze.conf
 * @return 0 at the end of the visible code, -1 when parse_args fails
 */
int main(int argc, char** argv)
	po::variables_map vm;
	std::string extraction_directory;
	std::vector<std::string> selected_plugins, selected_categories;

	// Load the dynamic plugins.
	// The directory containing the executable (parent of argv[0]) is used as the base path.
	bfs::path working_dir(argv[0]);
	working_dir = working_dir.parent_path();

	// Load the configuration
	config conf = parse_config((working_dir / "manalyze.conf").string());

	// Bail out when the command line cannot be parsed.
	if (!parse_args(vm, argc, argv)) {
		return -1;

	// Get all the paths now and make them absolute before changing the working directory
	std::set<std::string> targets = get_input_files(vm);
	if (vm.count("extract")) {
		extraction_directory = bfs::absolute(vm["extract"].as<std::string>()).string();
	// Break complex arguments into a list once and for all.
	if (vm.count("plugins")) {
		selected_plugins = tokenize_args(vm["plugins"].as<std::vector<std::string> >());
	if (vm.count("dump")) {
		selected_categories = tokenize_args(vm["dump"].as<std::vector<std::string> >());

	// Instantiate the requested OutputFormatter
	boost::shared_ptr<io::OutputFormatter> formatter;
	if (vm.count("output") && vm["output"].as<std::string>() == "json") {
		formatter.reset(new io::JsonFormatter());
	else // Default: use the human-readable output.
		formatter.reset(new io::RawFormatter());
		formatter->set_header("* Manalyze " MANALYZE_VERSION " *");

	// Set the working directory to Manalyze's folder.
	// NOTE(review): the statement that performs this change is missing from the excerpt.

	// Do the actual analysis on all the input files
	unsigned int count = 0;
	for (auto it = targets.begin() ; it != targets.end() ; ++it)
		perform_analysis(*it, vm, extraction_directory, selected_categories, selected_plugins, conf, formatter);
		// Emit the accumulated output every 1000 files.
		if (++count % 1000 == 0) {
			formatter->format(std::cout, false); // Flush the formatter from time to time, to avoid eating up all the RAM when analyzing gigs of files.


	if (vm.count("plugins"))
		// Explicitly unload the plugins
		// NOTE(review): the unload call itself is missing from the excerpt.

	return 0;