Beispiel #1
0
/*
 * Prints the help screen of the "salad train" mode via print() and returns
 * EXIT_SUCCESS.
 *
 * The descriptions of --input-filter, of the network related options and of
 * --output-format are only compiled in if USE_REGEX_FILTER, USE_NETWORK or
 * USE_ARCHIVES are defined, respectively.
 *
 * The arguments following the format string supply the defaults for the
 * conversion specifications in order of appearance. The pcap filter argument
 * is guarded by the same USE_NETWORK condition as its "%s" placeholder, such
 * that format and arguments always stay in sync.
 */
const int usage_train()
{
	print("Usage: salad train [options]\n"
	"\n"
	"I/O options:\n"
	"  -i,  --input <file>         The input filename.\n"
	"  -f,  --input-format <fmt>   Sets the format of input. This option might be \n"
	"                              one of " IOMODES ".\n"
#ifdef USE_REGEX_FILTER
	"       --input-filter <regex> The regular expression for filtering input lines\n"
	"                              or filenames respectively.\n"
#endif
	"       --batch-size <num>     Set the size of batches that are read and \n"
	"                              processed in one go (Default: %"ZU").\n"
#ifdef USE_NETWORK
	"  -p,  --pcap-filter <str>    Filter expression for the PCAP library in case\n"
	"                              network data is processed (Default: %s).\n"
	"       --client-only          Only consider the client-side of the network\n"
	"                              communication.\n"
	"       --server-only          Only consider the server-side of the network\n"
	"                              communication.\n"
#endif
	"  -u,  --update-model         In case the specified output file exists and\n"
	"                              contains a valid model this flag indicates\n"
	"                              that that model should be updated rather than\n"
	"                              recreated from scratch.\n"
	"  -o,  --output <file>        The output filename.\n"
#ifdef USE_ARCHIVES
	// If there is no libarchive support we can only make use of text-based configurations.
	"  -F,  --output-format <fmt>  Sets the format of output. This option might be \n"
	"                              one of " SALAD_OUTPUTFMTS ".\n"
#endif
	"\n"
	"Feature options:\n"
	"  -n,  --ngram-len <num>      Set length of n-grams (Default: %"ZU").\n"
	"  -d,  --ngram-delim <delim>  Set delimiters for the use of word/ token n-grams.\n"
	"                              If omitted or empty byte n-grams are used.\n"
	"       --binary               Indicates to use bit n-grams rather than byte\n"
	"                              or token n-grams and consequently, disables the\n"
	"                              --ngram-delim option.\n"
	"  -s,  --filter-size <num>    Set the size of the bloom filter as bits of\n"
	"                              the index (Default: %u).\n"
	"       --hash-set <hashes>    Set the hash set to be used: 'simple', 'simple2'\n"
	"                              or 'murmur' (Default: '%s').\n"
	"\n"
	"Generic options:\n"
	"  -e,  --echo-params          Echo used parameters and settings.\n"
	"  -q,  --quiet                Suppress all output but warning and errors.\n"
	"  -h,  --help                 Print this help screen.\n",
	/* --batch-size  */ (SIZE_T) DEFAULT_CONFIG.batch_size,
#ifdef USE_NETWORK
	/* --pcap-filter */ DEFAULT_CONFIG.pcap_filter,
#endif
	/* --ngram-len   */ (SIZE_T) DEFAULT_CONFIG.ngram_length,
	/* --filter-size */ DEFAULT_CONFIG.filter_size,
	/* --hash-set    */ hashset_to_string(DEFAULT_CONFIG.hash_set));
	return EXIT_SUCCESS;
}
Beispiel #2
0
/*
 * Prints the help screen of the "salad inspect" mode via print() and returns
 * EXIT_SUCCESS.
 *
 * The descriptions of --input-filter and of the network related options are
 * only compiled in if USE_REGEX_FILTER or USE_NETWORK are defined,
 * respectively.
 */
const int usage_inspect()
{
	print("Usage: salad inspect [options]\n"
	"\n"
	"I/O options:\n"
	"  -i,  --input <file>         The input filename.\n"
	"  -f,  --input-format <fmt>   Sets the format of input. This option might be \n"
	"                              one of " IOMODES ".\n"
#ifdef USE_REGEX_FILTER
	"       --input-filter <regex> The regular expression for filtering input lines\n"
	"                              or filenames respectively.\n"
#endif
	"       --batch-size <num>     Set the size of batches that are read and \n"
	"                              processed in one go (Default: %"ZU").\n"
#ifdef USE_NETWORK
	"  -p,  --pcap-filter <str>    Filter expression for the PCAP library in case\n"
	"                              network data is processed (Default: %s).\n"
	"       --client-only          Only consider the client-side of the network\n"
	"                              communication.\n"
	"       --server-only          Only consider the server-side of the network\n"
	"                              communication.\n"
#endif
	"  -b,  --bloom <file>         The bloom filter to be used.\n"
	"  -o,  --output <file>        The output filename.\n"
	"\n"
	"Feature options:\n"
	"  -n,  --ngram-len <num>      Set length of n-grams (Default: %"ZU").\n"
	"  -d,  --ngram-delim <delim>  Set delimiters for the use of word/ token n-grams.\n"
	"                              If omitted or empty byte n-grams are used.\n"
	"       --binary               Indicates to use bit n-grams rather than byte\n"
	"                              or token n-grams and consequently, disables the\n"
	"                              --ngram-delim option.\n"
	"  -s,  --filter-size <num>    Set the size of the bloom filter as bits of\n"
	"                              the index (Default: %u).\n"
	"       --hash-set <hashes>    Set the hash set to be used: 'simple', 'simple2'\n"
	"                              or 'murmur' (Default: '%s').\n"
	"\n"
	"Generic options:\n"
	"  -e,  --echo-params          Echo used parameters and settings.\n"
	"  -h,  --help                 Print this help screen.\n",
	// The arguments below supply the defaults for the conversion
	// specifications above in order of appearance. The pcap filter is guarded
	// by the same USE_NETWORK condition as its "%s" placeholder.
	/* --batch-size  */ (SIZE_T) DEFAULT_CONFIG.batch_size,
#ifdef USE_NETWORK
	/* --pcap-filter */ DEFAULT_CONFIG.pcap_filter,
#endif
	/* --ngram-len   */ (SIZE_T) DEFAULT_CONFIG.ngram_length,
	/* --filter-size */ DEFAULT_CONFIG.filter_size,
	/* --hash-set    */ hashset_to_string(DEFAULT_CONFIG.hash_set));
	return EXIT_SUCCESS;
}
Beispiel #3
0
const saladstate_t parse_traininglike_options_ex(int argc, char* argv[], config_t* const config,
		const char *shortopts, const struct option *longopts)
{
	assert(argv != NULL);
	assert(config != NULL);

	int option, bs = FALSE, fo = FALSE;
	while ((option = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1)
	{
		switch (option)
		{
		case 'i':
			config->input = optarg;
			break;

		case 'f':
			config->input_type = as_iomode(optarg);
			break;

		case OPTION_INPUTFILTER:
			config->input_filter = optarg;
			break;

		case OPTION_BATCHSIZE:
		{
			int batch_size = atoi(optarg); // TODO: strtol
			if (batch_size <= 0)
			{
				warn("Illegal batch size specified.\n");
				// This is not true in case of network data as input. Therefore,
				// we simply suppress this output at this point.
				// warn("Defaulting to: %u\n", (unsigned int) config->batch_size);
			}
			else
			{
				bs = TRUE;
				config->batch_size = batch_size;
			}
			break;
		}
#ifdef USE_NETWORK
		case 'p':
			config->pcap_filter = optarg;
			break;
#endif
		case 'b':
			config->bloom = optarg;
			break;

		case 'u':
			config->update_model = TRUE;
			break;

		case 'o':
			config->output = optarg;
			break;

		case 'n':
		{
			fo = TRUE;
			int ngramLength = atoi(optarg); // TODO: strtol
			if (ngramLength <= 0)
			{
				warn("Illegal n-gram length specified.");
				warn("Defaulting to: %u\n", (unsigned int) config->ngramLength);
			}
			else config->ngramLength = ngramLength;
			break;
		}
		case 'd':
			fo = TRUE;
			config->delimiter = optarg;
			break;

		case 's':
		{
			fo = TRUE;
			int filter_size = atoi(optarg); // TODO: strtol
			if (filter_size <= 0)
			{
				warn("Illegal filter size specified.");
				warn("Defaulting to: %u\n", (unsigned int) config->filter_size);
			}
			else config->filter_size = filter_size;
			break;
		}
		case OPTION_HASHSET:
		{
			fo = TRUE;
			hashset_t hashset = to_hashset(optarg);
			if (hashset == HASHES_UNDEFINED)
			{
				warn("Illegal hash set specified.");
				warn("Defaulting to: %s\n", hashset_to_string(config->hash_set));
			}
			else config->hash_set = hashset;
			break;
		}
		case 'e':
			config->echo_params = TRUE;
			break;

		case '?':
		case 'h':
			return SALAD_HELP_TRAIN;

		default:
			// In order to catch program argument that correspond to
			// features that were excluded at compile time.
			fprintf(stderr, "invalid option -- '%c'\n", option);
			return SALAD_HELP_TRAIN;
		}
	}

	config->transfer_spec = !fo;

	if (check_input(config, TRUE, bs) == EXIT_FAILURE) return SALAD_EXIT;
	if (check_output(config) == EXIT_FAILURE) return SALAD_EXIT;

	if (config->echo_params)
	{
		if (config->update_model && config->transfer_spec) {
			// cf. salad_train_stub
		} else {
			echo_options(config);
		}
	}
	return SALAD_RUN;
}
Beispiel #4
0
const saladstate_t parse_traininglike_options_ex(int argc, char* argv[], config_t* const config,
		const char *shortopts, const struct option *longopts)
{
	assert(argv != NULL);
	assert(config != NULL);

	char* end; // For parsing numbers with strto*
	int conly = FALSE, sonly = FALSE;

	int option, bs = FALSE, fo = FALSE;
	while ((option = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1)
	{
		switch (option)
		{
		case 'i':
			config->input = optarg;
			break;

		case 'f':
			config->input_type = as_inputmode(optarg);
			break;

		case OPTION_INPUTFILTER:
			config->input_filter = optarg;
			break;

		case OPTION_BATCHSIZE:
		{
			const long long int batch_size = strtoll(optarg, &end, 10);
			if (batch_size <= 0)
			{
				warn("Illegal batch size specified.\n");
				// This is not true in case of network data as input. Therefore,
				// we simply suppress this output at this point.
				// warn("Defaulting to: %u\n", (unsigned int) config->batch_size);
			}
			else
			{
				bs = TRUE;
				config->batch_size = (size_t) MIN(SIZE_MAX, (unsigned long) MAX(0, batch_size));
			}
			break;
		}

#ifdef USE_NETWORK
		case 'p':
			config->pcap_filter = optarg;
			break;

		case OPTION_NETCLIENT:
			conly = TRUE;
			break;

		case OPTION_NETSERVER:
			sonly = TRUE;
			break;
#endif
		case 'b':
			config->bloom = optarg;
			break;

		case 'u':
			config->update_model = TRUE;
			break;

		case 'o':
			config->output = optarg;
			break;

		case 'F':
			config->output_type = as_outputmode(optarg);
			break;

		case 'n':
		{
			fo = TRUE;
			const long long int ngram_length = strtoll(optarg, &end, 10);
			if (ngram_length <= 0)
			{
				warn("Illegal n-gram length specified.");
				warn("Defaulting to: %"ZU"\n", (SIZE_T) config->ngram_length);
			}
			else config->ngram_length = (size_t) MIN(SIZE_MAX, (unsigned long) ngram_length);
			break;
		}
		case 'd':
			fo = TRUE;
			config->delimiter = optarg;
			break;

		case OPTION_BINARY:
			config->binary_ngrams = TRUE;
			break;

		case 's':
		{
			fo = TRUE;
			const long long int filter_size = strtoll(optarg, &end, 10);
			if (filter_size <= 0)
			{
				warn("Illegal filter size specified.");
				warn("Defaulting to: %u\n", (unsigned int) config->filter_size);
			}
			else config->filter_size = (unsigned int) MIN(UINT_MAX, (unsigned long) MAX(0, filter_size));
			break;
		}
		case OPTION_HASHSET:
		{
			fo = TRUE;
			hashset_t hashset = to_hashset(optarg);
			if (hashset == HASHES_UNDEFINED)
			{
				warn("Illegal hash set specified.");
				warn("Defaulting to: %s\n", hashset_to_string(config->hash_set));
			}
			else config->hash_set = hashset;
			break;
		}
		case 'e':
			config->echo_params = TRUE;
			break;

		case 'q':
			log_level = WARNING;
			break;

		case '?':
		case 'h':
			log_level = STATUS;
			return SALAD_HELP_TRAIN;

		default:
			// In order to catch program argument that correspond to
			// features that were excluded at compile time.
			fprintf(stderr, "invalid option -- '%c'\n", option);
			return SALAD_HELP_TRAIN;
		}
	}

	config->transfer_spec = !fo;

	if (config->binary_ngrams && config->ngram_length > MASK_BITSIZE)
	{
		error("When using binary n-grams currently only a maximal");
		error("length of %u bits is supported.", MASK_BITSIZE);
		return SALAD_EXIT;
	}

	if (check_netparams(config, conly, sonly) == EXIT_FAILURE) return SALAD_HELP_TRAIN;
	if (check_input(config, TRUE, bs) == EXIT_FAILURE) return SALAD_EXIT;
	if (check_output(config) == EXIT_FAILURE) return SALAD_EXIT;

	if (config->echo_params)
	{
		if (config->update_model && config->transfer_spec) {
			// cf. salad_train_stub
		} else {
			echo_options(config);
		}
	}
	return SALAD_RUN;
}