/*
 * Locate the longest known TLD at the end of the host part of org_str.
 *
 * tld_tree : TLD tree; candidates are looked up under tld_tree->kid and the
 *            longest match is then checked against tld_tree->sibling
 *            (exception rules, ex: !siemens.om vs *.om).
 * org_str  : string containing the host; host.pos is used as an offset into it.
 * host     : position and size of the host inside org_str.
 *
 * Returns a faup_tld_tree_extracted_t whose pos is host.pos + host.size - size
 * and whose size is the TLD length. When nothing matches (or tld_tree/org_str
 * is NULL, or the host is empty) pos is -1 and size is 0.
 *
 * Ex (host starting at offset 0):
 *   - google.com   => 7 (='com')
 *   - abc.co.uk    => 4 (='co.uk')
 *   - google.nawak => -1 (NOT FOUND)
 *
 * NOTE(review): candidates are handed to faup_tld_tree_tld_exists() as plain
 * NUL-terminated suffixes and the size is taken with strlen(), so this
 * presumably expects org_str to end where the host ends -- confirm with callers.
 */
faup_tld_tree_extracted_t faup_tld_tree_extract(TLDNode *tld_tree, const char *org_str, faup_feature_t host)
{
	faup_tld_tree_extracted_t tld_extracted;
	const char *host_start;
	const char *p;
	const char *last = NULL;
	const char *dot;
	int step;

	tld_extracted.pos = -1;
	tld_extracted.size = 0;

	if (!tld_tree || !org_str || !host.size) {
		return tld_extracted;
	}

	host_start = org_str + host.pos;
	p = host_start + host.size - 1;

	while (*p) {
		/* Walk back to the previous dot, but never past the first host char. */
		while (p > host_start && *p != '.') {
			p--;
		}

		/* On a dot the candidate starts just after it; otherwise we are at
		 * the first host character and the candidate is the whole host. */
		step = (*p == '.') ? 1 : 0;
		if (!faup_tld_tree_tld_exists(tld_tree->kid, p + step)) {
			break;
		}
		last = p + step;

		if (p == host_start) {
			break; /* nothing left in front of this candidate */
		}
		p--;
	}

	if (last == NULL) {
		return tld_extracted;
	}

	/* here we have the longest TLD
	 * but is that an exception ? (ex: !siemens.om vs *.om)
	 * If so, drop its first label. An exception without any dot is left
	 * untouched instead of scanning past the end of the string. */
	if (faup_tld_tree_tld_exists(tld_tree->sibling, last)) {
		dot = strchr(last, '.');
		if (dot != NULL) {
			last = dot + 1;
		}
	}

	tld_extracted.size = strlen(last);
	tld_extracted.pos = host.pos + host.size - tld_extracted.size;

	return tld_extracted;
}
/*
 * Locate the longest known TLD at the end of the host recorded in
 * fh->faup.features.host.
 *
 * fh       : handler whose faup.features.host gives the host position and
 *            size inside org_str.
 * tld_tree : TLD tree; candidates are looked up under tld_tree->kid and the
 *            longest match is then checked against tld_tree->sibling
 *            (exception rules, ex: !siemens.om vs *.om).
 * org_str  : string containing the host; host.pos is used as an offset into it.
 *
 * Returns a faup_tld_tree_extracted_t: pos is the offset of the TLD inside
 * org_str and size its length. pos is -1 and size is 0 when no TLD matches,
 * when tld_tree or tld_tree->kid is missing (a message is printed on stderr),
 * when fh or org_str is NULL, or when the host is empty.
 *
 * Ex (host starting at offset 0):
 *   - google.com   => 7 (='com')
 *   - abc.co.uk    => 4 (='co.uk')
 *   - google.nawak => -1 (NOT FOUND)
 */
faup_tld_tree_extracted_t faup_tld_tree_extract(faup_handler_t *fh, TLDNode *tld_tree, const char *org_str)
{
	faup_tld_tree_extracted_t tld_extracted;
	const char *host_start;
	const char *p;
	const char *last = NULL;
	const char *scan;
	uint32_t host_size;
	uint32_t counter;           /* offset of p from host_start */
	uint32_t step;
	uint32_t candidate_len;
	uint32_t tld_len = 0;
	uint32_t tld_exception_len = 0;
	bool has_a_dot = false;

	tld_extracted.pos = -1;
	tld_extracted.size = 0;

	if (!tld_tree) {
		fprintf(stderr, "(Error) No TLD Tree given!\n");
		return tld_extracted;
	}
	if (!tld_tree->kid) {
		fprintf(stderr, "(Warning) Cannot extract TLD > 1. Mozilla list does not exists. Please download it (faup -u)\n");
		return tld_extracted;
	}
	if (!fh || !org_str) {
		return tld_extracted;
	}

	host_size = fh->faup.features.host.size;
	if (host_size == 0) {
		return tld_extracted;
	}

	host_start = org_str + fh->faup.features.host.pos;
	counter = host_size - 1;
	p = host_start + counter;

	for (;;) {
		/* Walk back to the previous dot, but never past the first host char
		 * (counter cannot drop below zero). */
		while (counter > 0 && *p != '.') {
			p--;
			counter--;
		}

		/* On a dot the candidate starts just after it; otherwise we are at
		 * the first host character and the candidate is the whole host. */
		step = (*p == '.') ? 1 : 0;
		candidate_len = host_size - counter - step;

		if (!faup_tld_tree_tld_exists(tld_tree->kid, p + step, candidate_len)) {
			break;
		}
		tld_len = candidate_len;
		last = p + step;

		if (counter == 0) {
			break; /* nothing left in front of this candidate */
		}
		p--;
		counter--;
	}

	if (last == NULL) {
		return tld_extracted;
	}

	/* here we have the longest TLD
	 * but is that an exception ? (ex: !siemens.om vs *.om)
	 * Only the tld_len characters belonging to the host are considered, so
	 * anything following the host in org_str is ignored:
	 * www.foo.siemens.om/tagada != www.foo.siemens.om */
	if (tld_tree->sibling && faup_tld_tree_tld_exists(tld_tree->sibling, last, tld_len)) {
		/* Measure the first label of the exception (up to its first dot). */
		scan = last;
		while (tld_exception_len < tld_len && *scan != '.') {
			scan++;
			tld_exception_len++;
		}
		has_a_dot = (tld_exception_len < tld_len);
	}

	tld_extracted.size = tld_len - tld_exception_len;
	if (!tld_extracted.size) {
		/* The exception had no dot at all: keep the full match. */
		tld_extracted.size = tld_len;
	}

	tld_extracted.pos = fh->faup.features.host.pos + fh->faup.features.host.size - tld_extracted.size;

	if (has_a_dot) {
		/* Skip the dot that separated the dropped label from the TLD. */
		tld_extracted.pos += 1;
		tld_extracted.size -= 1;
	}

	return tld_extracted;
}