Example #1
0
// Assemble a vCard 3.0 text record from the direct children of `node`.
//
// Every child that carries a "class" attribute is inspected; each
// space-separated class token that matches a known field name appends
// one line to the card:
//   adr   -> ADR:   (via extractAddress)
//   tel   -> TEL;TYPE=VOICE:
//   fn    -> FN:    (also remembered as the card's display name)
//   url   -> URL:   (taken from the child's href attribute, if any)
//   email -> EMAIL: (href attribute, with a leading "mailto:" removed)
//   org   -> ORG:
// The "photo" token is recognised but produces no output.
//
// The finished card is appended to _cards, paired with its name, only
// when an "fn" field yielded a non-empty name.
void KonqMFIcon::extractCard(DOM::Node node) {
	QString fullName;
	QString card;
	card += "BEGIN:VCARD\nVERSION:3.0\n";

	DOM::NodeList children = node.childNodes();
	const unsigned int childCount = children.length();

	for (unsigned int c = 0; c < childCount; ++c) {
		DOM::Node child = children.item(c);
		DOM::NamedNodeMap attrs = child.attributes();

		for (unsigned int a = 0; a < attrs.length(); ++a) {
			DOM::Node attr = attrs.item(a);
			if (attr.nodeName().string() == "class") {
				// A class attribute may list several tokens; handle each one.
				QStringList tokens = QStringList::split(' ', attr.nodeValue().string());
				QStringList::ConstIterator tok = tokens.begin();
				while (tok != tokens.end()) {
					const QString &field = *tok;
					++tok;

					if (field == "photo") {
						// Recognised, but nothing is emitted for photos.
						continue;
					}

					if (field == "adr") {
						card += "ADR:";
						card += extractAddress(child);
						card += "\n";
					} else if (field == "tel") {
						card += "TEL;TYPE=VOICE:";
						card += textForNode(child);
						card += "\n";
					} else if (field == "fn") {
						fullName = textForNode(child);
						card += "FN:";
						card += fullName;
						card += "\n";
					} else if (field == "url") {
						DOM::Node href = child.attributes().getNamedItem("href");
						if (!href.isNull()) {
							card += "URL:";
							card += href.nodeValue().string().stripWhiteSpace();
							card += "\n";
						}
					} else if (field == "email") {
						DOM::Node href = child.attributes().getNamedItem("href");
						if (!href.isNull()) {
							QString address = href.nodeValue().string();
							if (address.startsWith("mailto:")) {
								// 7 == length of the "mailto:" scheme prefix
								address = address.mid(7);
							}
							card += "EMAIL:";
							card += address.stripWhiteSpace();
							card += "\n";
						}
					} else if (field == "org") {
						card += "ORG:";
						card += textForNode(child);
						card += "\n";
					}
				}
			}
		}
	}

	// A card without a formatted name is discarded.
	if (fullName.isEmpty()) {
		return;
	}

	card += "END:VCARD\n";
	_cards.append(qMakePair(fullName, card));
}
Example #2
0
/*
 * getAddress - fetch a web page, extract addresses from it and print an
 * HTML report to stdout.
 *
 * Steps, in order:
 *   1. Fetch `url` with httpFetch (8 second timeout).
 *   2. Convert the page to a tag-free token string (convertToken2Text) and
 *      run extractAddress over it, collecting results in a vector.
 *   3. Print a header and the list of extracted addresses.
 *   4. If the request carries a POST body (CONTENT_LENGTH is set and within
 *      limits), parse the feedback form
 *      "extraction=<right|wrong>&numRight=<n>&numTotal=<n>" and, when the
 *      verdict is recognised, store the tagged page via saveTaggedText.
 *   5. Print the feedback form (if SHOW_COLLECT_DATA_INTERFACE) and the
 *      page with extracted addresses highlighted.
 *
 * url: address of the page to process; it is also echoed into the output.
 *
 * Returns 0. On fetch failure or memory exhaustion a message is printed
 * and the process terminates through exit(0), as before.
 *
 * NOTE(review): `url` and the extracted addresses are written into the
 * HTML output without escaping. If `url` can come from an untrusted
 * request this is an injection risk - confirm and escape at the caller
 * or here.
 */
int getAddress (char* url) {
	/* Limits for the POSTed feedback form.
	   EXTRA is 11 == strlen("extraction="), the leading form field that
	   is skipped before the right/wrong value is compared.
	   Compile-time constants so that `input` is a fixed-size array. */
	enum {
		MAXLEN = 1805,
		EXTRA = 11,
		MAXINPUT = MAXLEN + EXTRA + 2
	};

	char *stream, *text, *textHighlight, *tokens, *lenstr;
	int *positions = NULL; /* array to record position of each token in the text */
	cvector addressVector;
	Address *adr;

	long len;
	char input[MAXINPUT];
	char *data, *p, *amp;
	size_t skip;
	int rightOrWrong = -1, numRight, numTotal;
	int i, ret;

	http_setTimeout(8); /* seconds */

	/* fetch web page */
	ret = httpFetch (url, &stream);
	if (ret == -1) {
		printf("%s\n",http_strerror());
		exit(0);
	}
	assert(stream);

	/* Allocate every working buffer up front and verify all of them, so
	   that no later step writes through a NULL pointer. */
	text = (char*)malloc(ret+2);
	tokens = (char*)malloc(DEF_BUFF_SIZE);
	textHighlight = (char*)malloc(DEF_BUFF_SIZE);
	if (!text || !tokens || !textHighlight) {
		printf("out of memory when convert text to tokens!\n");
		exit(0);
	}

	strncpy(text, stream, ret);
	/* terminate with two '\0' bytes: the flex scanner needs a double
	   terminator at the end of its input */
	*(text+ret) = '\0';
	*(text+ret+1) = '\0';
	free(stream);

	/* convert text to tokens, remove tags
	   and convert back to string: tokens
	   and keep all positions in array "positions" */
	convertToken2Text(text, tokens, &positions);

	VectorNew (&addressVector, sizeof (Address),free_address, DEF_ADDRESS_PER_PAGE);
	/* extract address,
	   get position from positions vector
	   and save extracted address, position, country to addressVector */
	extractAddress(tokens, positions, &addressVector);

	/* output header */
	printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\">\n");
	printf("<table border=1 width=100%%><tr><td><table border=1 bgcolor=#ffffff cellpadding=10 cellspacing=0 width=100%% color=#ffffff><tr><td>\n");
	printf("<font face=arial,sans-serif color=black size=-1>\n");
	printf("<b><a href='%s'>US, UK & Canadian Addresses</a> extracted by <a href='%s'>Geo Extractor</a> from web page</b> <a href='%s'>%s</a></font><br><br>\n",LIST_FILES_URL, HOME_PAGE,url,url);

	/* display extracted addresses: table header */
	printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n");
	printf("<table width=100%% border=0 cellpadding=0 cellspacing=0 bgcolor=#e5ecf9><tr><td width=10></td><td bgcolor=#e5ecf9 nowrap><br>\n");
	printf("<font face=arial,sans-serif color=black size=-1><b>\n");

	for (i=0; i<addressVector.ItemsCount; i++) {
		adr = (Address*)VectorNth(&addressVector,i);
		printf("%s<br>\n", adr->address);
	}
	printf("</b></font>\n");
	printf("<br></td></tr></table>\n");
	printf("<table width=100%% border=0 cellpadding=0 cellspacing=0><tr><td bgcolor=#3366cc><img width=1 height=1 alt=''></td></tr></table>\n");

	/* Defaults shown in the form when the user has not posted anything. */
	numRight=numTotal =addressVector.ItemsCount;

	/* if there is a user post, we save the user input to get tagged data */
	lenstr = getenv("CONTENT_LENGTH");
	if (lenstr != NULL && sscanf(lenstr,"%ld",&len) == 1 && len > 0 && len <= MAXLEN) {
		tagAddress(text, textHighlight, &addressVector);

		/* Only parse when a body was actually read; otherwise `input`
		   would be uninitialised. */
		if (fgets(input, (int)len + 1, stdin) != NULL) {
			URLdecode(input);

			/* The body must at least contain the leading field name
			   before anything behind it may be inspected. */
			if (strlen(input) >= (size_t)EXTRA) {
				data = input+EXTRA;

				if ( strncmp(data, "right", strlen("right"))==0 ) {
					rightOrWrong = 0; /* user input "right" */
				}
				if ( strncmp(data, "wrong", strlen("wrong"))==0 ) {
					rightOrWrong = 1; /* user input "Wrong Extraction" */
				}

				/* get user input: numRight, which is number of correctly
				   extracted addresses. "right" and "wrong" have the same
				   length, so one offset serves both. Every jump is
				   bounds-checked against the decoded string so a short
				   or malformed body cannot be read past its end; fields
				   that are missing keep their defaults. */
				skip = strlen("right") + strlen("&numRight=");
				if (strlen(data) >= skip) {
					p = data + skip;
					numRight = atoi(p); /* atoi stops at the following '&' */

					/* get user input numTotal, which is number of total
					   addresses in the page */
					amp = strchr(p, '&');
					if (amp != NULL && strlen(amp + 1) >= strlen("numTotal=")) {
						numTotal = atoi(amp + 1 + strlen("numTotal="));
					}
				}
			}

			/* Save the tagged page whenever a verdict was recognised;
			   rightOrWrong tells saveTaggedText which one it was. */
			if (rightOrWrong == 0 || rightOrWrong == 1) {
				saveTaggedText(url, text, textHighlight, rightOrWrong, numRight, addressVector.ItemsCount, numTotal);
			}
		}
	}

	/* give source text, and addressVector
	   highlight all extracted address in the webpage */
	getHighlight(text, textHighlight, &addressVector);

	/* show the feedback form that lets the user judge whether the
	   extraction is correct */
	if ( SHOW_COLLECT_DATA_INTERFACE) {

		printf("<FORM ACTION=\"%s%s\" METHOD=\"POST\">\n", GEO_URL, url);
		printf("<font face=arial,sans-serif color=black size=-1>\n");

		printf("<P><input name=\"extraction\" type=\"radio\" value=\"right\" ");
		if ((rightOrWrong == 0)||(rightOrWrong == -1)) /* no user input, or user said all correct */
			printf("checked");
		printf("> All address extracted correctly<br>\n");
		printf("<input name=\"extraction\" type=\"radio\" value=\"wrong\" ");
		if (rightOrWrong == 1) /* user said not all addresses are correct */
			printf("checked");
		printf("> Not all addresses extracted correctly. \n");
		printf("<input type=\"text\" name=\"numRight\" size=\"4\" value=\"%d\"> addresses extracted correctly from total <input type=\"text\" name=\"numTotal\" size=\"4\" value=\"%d\"> addresses<BR>\n", numRight, numTotal);
		printf("<INPUT TYPE=\"SUBMIT\" VALUE=\"Save Webpage\"></font></FORM>\n");
		/* show google search */
		printf("<SCRIPT language=\"JavaScript\">function OnSubmitForm(){ document.g.action =\"%shttp://www.google.com/search?num=100&q=\"+document.g.q.value.replace(\" \",\"%%2B\");}</SCRIPT>\n", GEO_URL);
		printf("<table border=0 align=right><tr><td>\n");
		printf("<form action=\"\" method=\"post\" name=\"g\" onSubmit=\"return OnSubmitForm();\">\n");
		printf("<input size=\"32\" name=\"q\">\n");
		printf("<INPUT TYPE=\"SUBMIT\" name=\"Submit\" VALUE=\"Google\"></form>\n");
		printf("</td></tr></table>\n");

	}
	printf("</td></tr></table></td></tr></table>\n");

	printf("<hr>\n");
	displayHtmlAbsoluteURL(textHighlight, url);

	VectorDispose(&addressVector);
	free (positions);
	free (text);
	free (tokens);
	/* NOTE(review): assumes displayHtmlAbsoluteURL does not take ownership
	   of its buffer - confirm. */
	free (textHighlight);

	return 0;

}