/**
 * Assemble a vCard 3.0 record from the direct children of @p node.
 *
 * For every child, the value of its "class" attribute is split on spaces and
 * each recognised class token contributes one vCard line:
 *   - adr   -> "ADR:"  + extractAddress(child)
 *   - tel   -> "TEL;TYPE=VOICE:" + textForNode(child)
 *   - fn    -> "FN:"   + textForNode(child) (also remembered as the card name)
 *   - url   -> "URL:"  + whitespace-stripped href attribute, if present
 *   - email -> "EMAIL:" + href attribute without a leading "mailto:", if present
 *   - org   -> "ORG:"  + textForNode(child)
 *   - photo -> recognised, but nothing is emitted for it
 *
 * The record is closed with "END:VCARD" and appended to _cards as a
 * (name, vCard text) pair only when a non-empty "fn" text was found.
 */
void KonqMFIcon::extractCard(DOM::Node node) {
    QString fullName;
    QString vcard;
    vcard += "BEGIN:VCARD\nVERSION:3.0\n";

    DOM::NodeList children = node.childNodes();
    const unsigned int childCount = children.length();

    for (unsigned int c = 0; c < childCount; ++c) {
        DOM::Node child = children.item(c);
        DOM::NamedNodeMap attrs = child.attributes();

        for (unsigned int a = 0; a < attrs.length(); ++a) {
            DOM::Node attr = attrs.item(a);
            if (!(attr.nodeName().string() != "class")) {
                // Only the "class" attribute carries microformat markers.
                const QStringList classes =
                    QStringList::split(' ', attr.nodeValue().string());

                QStringList::ConstIterator cls = classes.begin();
                for (; cls != classes.end(); ++cls) {
                    const QString &token = *cls;

                    if (token == "photo") {
                        // Recognised class; no vCard line is produced for it.
                        continue;
                    }
                    if (token == "adr") {
                        const QString address = extractAddress(child);
                        vcard += "ADR:" + address + "\n";
                        continue;
                    }
                    if (token == "tel") {
                        const QString phone = textForNode(child);
                        vcard += "TEL;TYPE=VOICE:" + phone + "\n";
                        continue;
                    }
                    if (token == "fn") {
                        fullName = textForNode(child);
                        vcard += "FN:" + fullName + "\n";
                        continue;
                    }
                    if (token == "url") {
                        DOM::Node href = attrs.getNamedItem("href");
                        if (!href.isNull()) {
                            const QString target = href.nodeValue().string();
                            vcard += "URL:" + target.stripWhiteSpace() + "\n";
                        }
                        continue;
                    }
                    if (token == "email") {
                        DOM::Node href = attrs.getNamedItem("href");
                        if (!href.isNull()) {
                            QString address = href.nodeValue().string();
                            if (address.startsWith("mailto:")) {
                                // Drop the 7-character "mailto:" scheme prefix.
                                address = address.mid(7);
                            }
                            vcard += "EMAIL:" + address.stripWhiteSpace() + "\n";
                        }
                        continue;
                    }
                    if (token == "org") {
                        const QString organisation = textForNode(child);
                        vcard += "ORG:" + organisation + "\n";
                    }
                }
            }
        }
    }

    if (fullName.isEmpty()) {
        // Without a formatted name the card is discarded.
        return;
    }

    vcard += "END:VCARD\n";
    _cards.append(qMakePair(fullName, vcard));
}
int getAddress (char* url) { char *stream, *text, *textHighlight, *lenstr; char *tokens = (char*)malloc(DEF_BUFF_SIZE); int *positions;// array to record position of each token in the text cvector addressVector; Address *adr; long len; int MAXLEN = 1805; int EXTRA = 11; /* 4 for field name "data", 1 for "=" */ int MAXINPUT = MAXLEN+EXTRA+2; char input[MAXINPUT]; char* data = input, *p; int rightOrWrong = -1, numRight, numTotal; //char* domain_url; int i; http_setTimeout(8);//seconds //fetch web page int ret = httpFetch (url, &stream); if (ret == -1) { printf("%s\n",http_strerror()); exit(0); } //printf("ret: %d, strlen: %d\n",ret, strlen(stream)); assert(stream); text= (char*)malloc(ret+2); if (!text) { printf("out of memory when convert text to tokens!\n"); exit(0); } strncpy(text, stream, ret); //append a '\0' to the end of string to make sure it is end with two '\0' for flex to scan *(text+ret) = '\0'; *(text+ret+1) = '\0'; free(stream); /* convert text to tokens, remove tags and convert back to string: tokens and keep all positions in array "positions" */ convertToken2Text(text, tokens, &positions); // get base domain of given url //e.g. 
given http://www.google.com/address, return http://www.google.com to domain_url /* domain_url = (char*)malloc(strlen(url)+1); strcpy(domain_url, url); for (i=strlen(url); i>0; i--) { if (url[i] == '/') { if (url[i-1] == '/' ) // is "//" break; else // not "//" domain_url[i] = '\0'; } } printf ("<base href=\"%s%s\">\n", GEO_URL, domain_url); free(domain_url); */ VectorNew (&addressVector, sizeof (Address),free_address, DEF_ADDRESS_PER_PAGE); //extract address, //get position from positions vector //and save extracted address, position, country to addressVector extractAddress(tokens, positions, &addressVector); //display the parsed text //printf("tokens: %s\n",tokens); //printf("url: %s\n", url); //printf("domain_url: %s\n", domain_url); //output header printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\">\n"); printf("<table border=1 width=100%%><tr><td><table border=1 bgcolor=#ffffff cellpadding=10 cellspacing=0 width=100%% color=#ffffff><tr><td>\n"); printf("<font face=arial,sans-serif color=black size=-1>\n"); printf("<b><a href='%s'>US, UK & Canadian Addresses</a> extracted by <a href='%s'>Geo Extractor</a> from web page</b> <a href='%s'>%s</a></font><br><br>\n",LIST_FILES_URL, HOME_PAGE,url,url); //printf("%s,",textHighlight); //display extracted address //table header printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n"); printf("<table width=100%% border=0 cellpadding=0 cellspacing=0 bgcolor=#e5ecf9><tr><td width=10></td><td bgcolor=#e5ecf9 nowrap><br>\n"); printf("<font face=arial,sans-serif color=black size=-1><b>\n"); for (i=0; i<addressVector.ItemsCount; i++) { adr = (Address*)VectorNth(&addressVector,i); printf("%s<br>\n", adr->address); /*printf("%s, start: %d, end: %d<br>\n",adr->address, adr->start, adr->end); for (j=adr->start; j<=adr->end; j++) printf("%c",*(text+j)); printf("\n"); */ } printf("</b></font>\n"); 
printf("<br></td></tr></table>\n"); printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n"); textHighlight = (char*)malloc(DEF_BUFF_SIZE); numRight=numTotal =addressVector.ItemsCount; /* if there is a user post, we save the user input to get tagged data*/ lenstr = getenv("CONTENT_LENGTH"); if ( !(lenstr == NULL || sscanf(lenstr,"%ld",&len)!=1 || len > MAXLEN) ) { tagAddress(text, textHighlight, &addressVector); fgets(input, len+1, stdin); URLdecode(input); data = input+EXTRA; //printf("posted: %s\n",data); len = strlen("right"); if ( strncmp(data, "right", len)==0 ) { rightOrWrong = 0; //set flag for right or wrong extraction } len = strlen("wrong"); if ( strncmp(data, "wrong", len)==0 ) { rightOrWrong = 1; //user input "Wrong Extraction" } // get user input: numRight, which is number of correct extracted address data += strlen("right") + strlen("&numRight="); p = data; while (*data++ !='&'); *data= '\0'; numRight = atoi(p); //printf("numRight: %d\n", numRight); // get user input numTotal, which is number of total address in the page p = data+strlen("numTotal="); numTotal = atoi(p); //printf("numTotal: %d\n", numTotal); if (rightOrWrong == 0) { //printf("webpage saved to RIGHT folder\n"); saveTaggedText(url, text, textHighlight, rightOrWrong, numRight, addressVector.ItemsCount, numTotal); } if ( rightOrWrong == 1 ) { //printf("webpage saved to WRONG folder\n"); saveTaggedText(url, text, textHighlight, rightOrWrong, numRight, addressVector.ItemsCount, numTotal); } //printf("tagged text: %s\n", textHighlight); } // give source text, and addressVector //highlight all extracted address in the webpage getHighlight(text, textHighlight, &addressVector); /* if there is at least one address extracted, show user input to let user judge where extraction is correct*/ if ( SHOW_COLLECT_DATA_INTERFACE) { printf("<FORM ACTION=\"%s%s\" METHOD=\"POST\">\n", GEO_URL, url); printf("<font 
face=arial,sans-serif color=black size=-1>\n"); printf("<P><input name=\"extraction\" type=\"radio\" value=\"right\" "); if ((rightOrWrong == 0)||(rightOrWrong == -1)) //if no user input or user input: extracted address all correct printf("checked"); printf("> All address extracted correctly<br>\n"); printf("<input name=\"extraction\" type=\"radio\" value=\"wrong\" "); if (rightOrWrong == 1) //user input: extracted address all correct printf("checked"); printf("> Not all addresses extracted correctly. \n"); printf("<input type=\"text\" name=\"numRight\" size=\"4\" value=\"%d\"> addresses extracted correctly from total <input type=\"text\" name=\"numTotal\" size=\"4\" value=\"%d\"> addresses<BR>\n", numRight, numTotal); printf("<INPUT TYPE=\"SUBMIT\" VALUE=\"Save Webpage\"></font></FORM>\n"); //show google search printf("<SCRIPT language=\"JavaScript\">function OnSubmitForm(){ document.g.action =\"%shttp://www.google.com/search?num=100&q=\"+document.g.q.value.replace(\" \",\"%%2B\");}</SCRIPT>\n", GEO_URL); printf("<table border=0 align=right><tr><td>\n"); printf("<form action=\"\" method=\"post\" name=\"g\" onSubmit=\"return OnSubmitForm();\">\n"); printf("<input size=\"32\" name=\"q\">\n"); printf("<INPUT TYPE=\"SUBMIT\" name=\"Submit\" VALUE=\"Google\"></form>\n"); printf("</td></tr></table>\n"); } printf("</td></tr></table></td></tr></table>\n"); //extract address from original html text // extract_address(text); // printf("Original <hr>%s",text); printf("<hr>\n"); //printf("%s",textHighlight); displayHtmlAbsoluteURL(textHighlight, url); VectorDispose(&addressVector); free (positions); free (text); free (tokens); return 0; }