Exemplo n.º 1
0
/*
 * Locate the longest known TLD at the end of the host part of a URL.
 * Requires a TLD tree.
 *
 * Parameters:
 *   tld_tree - TLD tree; `kid` is searched for regular entries and
 *              `sibling` for exception entries (ex: !siemens.om vs *.om).
 *   org_str  - string that contains the host.
 *   host     - position (offset into org_str) and size of the host.
 *
 * Returns a faup_tld_tree_extracted_t whose `pos` is the offset of the TLD
 * inside org_str and whose `size` is its length. When no TLD is found (or
 * when tld_tree/org_str is NULL or the host is empty) pos is -1 and size 0.
 *
 * Ex (host covering the whole string):
 * - google.com => pos 7, size 3 (='com')
 * - abc.co.uk  => pos 4, size 5 (='co.uk')
 * - google.nawak => pos -1 (NOT FOUND)
 *
 * NOTE(review): host.pos is assumed to be a valid, non-negative offset into
 * org_str and host.pos + host.size is assumed not to exceed its length;
 * confirm against the callers.
 */
faup_tld_tree_extracted_t faup_tld_tree_extract(TLDNode *tld_tree, const char *org_str, faup_feature_t host)
{
	const char *host_start;
	const char *host_end;
	const char *p;
	const char *dot;
	const char *last = NULL;
	faup_tld_tree_extracted_t tld_extracted;

	tld_extracted.pos = -1;
	tld_extracted.size = 0;

	if( tld_tree == NULL || org_str == NULL || host.size == 0 )
		return tld_extracted;

	host_start = org_str + host.pos;
	host_end   = host_start + host.size;

	// Walk the host right to left, one label at a time, and keep the
	// longest suffix the tree knows about.
	p = host_end - 1;
	for( ;; )
	{
		// Never step before the first character of the host: the bytes
		// located before it are not part of the host and may not even
		// belong to the buffer.
		while( p > host_start && *p != '.' )
			p--;

		// No further label separator inside the host.
		if( *p != '.' )
			break;

		if( ! faup_tld_tree_tld_exists(tld_tree->kid, p + 1) )
			break;
		last = p + 1;

		// The separator was the very first character of the host.
		if( p == host_start )
			break;
		p--;
	}

	if( last == NULL )
		return tld_extracted;

	// here we have the longest TLD
	// but is that an exception ? (ex: !siemens.om vs *.om)
	if( faup_tld_tree_tld_exists(tld_tree->sibling, last) )
	{
		// Drop the leftmost label, looking for its separator inside the
		// host only. Without a separator the match is kept as it is.
		for( dot = last; dot < host_end && *dot != '.'; dot++ )
			;
		if( dot < host_end )
			last = dot + 1;
	}

	// Measure up to the end of the host, not up to the end of org_str, so
	// that anything following the host is not counted as part of the TLD.
	tld_extracted.size = (size_t)(host_end - last);
	tld_extracted.pos = host.pos + host.size - tld_extracted.size;

	return tld_extracted;
}
Exemplo n.º 2
0
/*
 * Locate the longest known TLD at the end of the host feature stored in the
 * handler. Requires a TLD tree.
 *
 * Parameters:
 *   fh       - handler; fh->faup.features.host gives the position (offset
 *              into org_str) and the size of the host.
 *   tld_tree - TLD tree; `kid` is searched for regular entries and
 *              `sibling` (when present) for exception entries
 *              (ex: !siemens.om vs *.om).
 *   org_str  - string that contains the host.
 *
 * Returns a faup_tld_tree_extracted_t whose `pos` is the offset of the TLD
 * inside org_str and whose `size` is its length. pos is -1 and size 0 when
 * nothing is found, when the tree is missing or has no `kid` (a message is
 * printed on stderr in those two cases), when fh/org_str is NULL or when
 * the host is empty.
 *
 * Ex (host covering the whole string):
 * - google.com => pos 7, size 3 (='com')
 * - abc.co.uk  => pos 4, size 5 (='co.uk')
 * - google.nawak => pos -1 (NOT FOUND)
 *
 * NOTE(review): host.pos is assumed to be a valid, non-negative offset into
 * org_str; confirm against the callers.
 */
faup_tld_tree_extracted_t faup_tld_tree_extract(faup_handler_t *fh, TLDNode *tld_tree, const char *org_str)
{
	const char *host_start;
	const char *candidate;
	const char *last = NULL;
	uint32_t host_size;
	uint32_t idx;
	uint32_t candidate_len;
	uint32_t tld_len = 0;
	uint32_t skip;
	faup_tld_tree_extracted_t tld_extracted;

	tld_extracted.pos = -1;
	tld_extracted.size = 0;

	if (!tld_tree) {
		fprintf(stderr, "(Error) No TLD Tree given!\n");
		return tld_extracted;
	}

	if (!tld_tree->kid) {
		fprintf(stderr, "(Warning) Cannot extract TLD > 1. Mozilla list does not exists. Please download it (faup -u)\n");
		return tld_extracted;
	}

	if (!fh || !org_str) {
		return tld_extracted;
	}

	host_size = fh->faup.features.host.size;
	if (host_size == 0) {
		// Nothing to scan; also keeps `host_size - 1` from wrapping.
		return tld_extracted;
	}

	host_start = org_str + fh->faup.features.host.pos;

	// idx is the offset, inside the host, of the character being examined.
	// Walk right to left, one label at a time, and keep the longest suffix
	// the tree knows about.
	idx = host_size - 1;
	while (idx) {
		// Stop at the first character of the host: idx is unsigned and must
		// not wrap, and the bytes before the host must not be read.
		while (idx && host_start[idx] != '.') {
			idx--;
		}

		// No further label separator inside the host.
		if (host_start[idx] != '.') {
			break;
		}

		candidate = host_start + idx + 1;
		candidate_len = host_size - idx - 1;

		if (!faup_tld_tree_tld_exists(tld_tree->kid, candidate, candidate_len)) {
			break;
		}

		last = candidate;
		tld_len = candidate_len;

		// The separator was the very first character of the host.
		if (idx == 0) {
			break;
		}
		idx--;
	}

	if (last == NULL) {
		return tld_extracted;
	}

	// tld_len only covers characters of the host, so whatever follows the
	// host is ignored: www.foo.siemens.om/tagada != www.foo.siemens.om
	if (tld_tree->sibling && faup_tld_tree_tld_exists(tld_tree->sibling, last, tld_len)) {
		// here we have the longest TLD
		// but is that an exception ? (ex: !siemens.om vs *.om)
		// If so, drop its leftmost label together with the separator.
		// Without a separator inside the match, keep the match as it is.
		for (skip = 0; skip < tld_len && last[skip] != '.'; skip++) {
			;
		}
		if (skip < tld_len) {
			tld_len -= skip + 1;
		}
	}

	tld_extracted.size = tld_len;
	tld_extracted.pos = fh->faup.features.host.pos + fh->faup.features.host.size - tld_extracted.size;

	return tld_extracted;
}