示例#1
0
// Scans the first n bytes of buffer with a small state machine and reports
// what it finds through the parser callbacks:
//   - every href="..." inside an <A ...> tag and every src="..." inside a
//     <FRAME ...> tag is passed to onAnchorFound() as a NUL-terminated string
//     (truncated to MaxURLLength-1 characters);
//   - the text between <TITLE> and </title> is sent to onContentFound() one
//     character at a time, preceded by ']' and followed by '\n' and '[';
//   - the CONTENT value of a keywords meta tag is sent preceded by '/', the
//     CONTENT value of a description meta tag preceded by ':'; both are
//     followed by '['.
// Text outside those constructs is skipped; the whitespace-collapsing content
// emission of the basic parser is disabled in this variant.
//
// match() is defined elsewhere; this code assumes it advances b past the
// literal when the literal is found at b and leaves b unchanged otherwise
// -- confirm against its definition.
//
// Parameters:
//   buffer - start of the HTML text (it does not need to be NUL-terminated
//            as far as this function is concerned).
//   n      - number of bytes of buffer to scan.
// Returns true once the whole buffer has been scanned. (The previous version
// fell off the end of a bool function, which is undefined behaviour.)
bool
SimpleHTMLParser::parse(char * buffer, int n)
{
	enum { START, TAG, SCRIPT, ANCHOR, HREF,
	       COMMENT, FRAME, SRC, TITLE, METAK, METAD, METAN } state;

	state = START;

	// Scratch space for the CONTENT value of a "<META CONTENT=..." tag whose
	// NAME attribute comes last. A fixed array replaces the old malloc'd
	// buffer, which was never freed and was read before being initialised.
	const int metaCapacity = 1000;
	char metaText[metaCapacity];

	// Doubles as "prefix marker already emitted" for TITLE/METAK/METAD and as
	// the kind of meta tag recognised in METAN (1 = description, 2 = keywords).
	int ent = 0;

	char * bufferEnd = buffer + n;
	char * b = buffer;
	while (b < bufferEnd) {
		switch (state) {
		case START: {
			// Order matters: the specific tags must be tried before the
			// generic "<" fallback.
			if (match(&b,"<SCRIPT")) {
				state = SCRIPT;
			}
			else if (match(&b,"<TITLE>")) {
				state = TITLE;
			}
			else if (match(&b,"<META NAME=\"KEYWORDS\" CONTENT=\"")) {
				state = METAK;
			}
			else if (match(&b,"<META NAME=\"DESCRIPTION\" CONTENT=\"")) {
				state = METAD;
			}
			else if (match(&b,"<META CONTENT=\"")) {
				state = METAN;
			}
			else if (match(&b,"<!--")) {
				state = COMMENT;
			}
			else if (match(&b,"<A ")) {
				state = ANCHOR;
			}
			else if (match(&b,"<FRAME ")) {
				state = FRAME;
			}
			else if (match(&b,"<")) {
				state = TAG;
			}
			else {
				// Plain text outside any recognised tag: skipped.
				b++;
			}
			break;
		}
		case ANCHOR: {
			if (match(&b,"href=\"")) {
				state = HREF;
				urlAnchorLength=0;
			}
			else if (match(&b,">")) {
				// End of the <A ...> tag
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case HREF: {
			if (match(&b,"\"")) {
				// Closing quote: the URL is complete
				state = ANCHOR;
				urlAnchor[urlAnchorLength] = '\0';
				onAnchorFound(urlAnchor);
			}
			else {
				// Keep one slot free for the terminating NUL; longer URLs
				// are truncated
				if ( urlAnchorLength < MaxURLLength-1) {
					urlAnchor[urlAnchorLength] = *b;
					urlAnchorLength++;
				}
				b++;
			}
			break;
		}
		case TITLE: {
			// Emit the opening marker once per title
			if (ent == 0) {
				onContentFound(']');
				ent++;
			}

			if (match(&b,"</title>")) {
				onContentFound('\n');
				onContentFound('[');
				ent = 0;
				state = START;
			}
			else {
				onContentFound(*b);
				b++;
			}
			break;
		}
		case FRAME: {
			if (match(&b,"src=\"")) {
				state = SRC;
				urlAnchorLength=0;
			}
			else if (match(&b,">")) {
				// End of the <FRAME ...> tag
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case METAK: {
			// Keywords content: '/' prefix once, then the raw characters
			if (ent == 0) {
				onContentFound('/');
				ent++;
			}
			if (match(&b,"\" />")) {
				onContentFound('[');
				ent = 0;
				state = START;
				break;
			}
			onContentFound(*b);
			b++;
			break;
		}
		case METAD: {
			// Description content: ':' prefix once, then the raw characters
			if (ent == 0) {
				onContentFound(':');
				ent++;
			}
			if (match(&b,"\" />")||match(&b,">")) {
				onContentFound('[');
				ent = 0;
				state = START;
				break;
			}
			onContentFound(*b);
			b++;
			break;
		}
		case METAN: {
			// "<META CONTENT=\"..." with the NAME attribute last: the content
			// has to be buffered because its kind is only known at the end of
			// the tag.
			bool nameFound = false;
			int metaLength = 0;

			// The bufferEnd test stops the scan at the end of the input when
			// the tag is never closed (the old loop kept reading past it).
			while (b < bufferEnd && !match(&b,"/>")) {
				// Read the character first so the result does not depend on
				// what match() does to b.
				const char current = *b;

				if (match(&b,"NAME=\"DESCRIPTION\"/>")) {
					nameFound = true;
					ent = 1;
					break;
				}
				if (match(&b,"NAME=\"KEYWORDS\"/>")) {
					nameFound = true;
					ent = 2;
					break;
				}
				if (match(&b,"\">")) {
					// Tag closed without a recognised NAME: discard it
					metaLength = 0;
					ent = 0;
					break;
				}

				// Over-long content is truncated instead of overflowing
				if (metaLength < metaCapacity - 1) {
					metaText[metaLength] = current;
					metaLength++;
				}
				b++;
			}
			metaText[metaLength] = '\0';
			state = START;

			if (!nameFound) {
				// Plain "/>", "\">" or end of input: nothing to report
				ent = 0;
				break;
			}

			if (ent == 1) {
				// Description: everything collected before NAME=, which
				// includes the closing quote of CONTENT and what follows it
				onContentFound(':');
				for (const char * p = metaText; *p != '\0'; ++p) {
					onContentFound(*p);
				}
			}
			else {
				// Keywords: only up to the closing quote of CONTENT
				onContentFound('/');
				for (const char * p = metaText; *p != '\0' && *p != '\"'; ++p) {
					onContentFound(*p);
				}
			}
			onContentFound('[');
			// NOTE(review): ent stays 1 or 2 here, exactly as before, so a
			// later TITLE/METAK/METAD section will not emit its prefix
			// marker. Looks unintended -- confirm before resetting it.
			break;
		}
		case SRC: {
			if (match(&b,"\"")) {
				// Closing quote: the frame URL is complete
				state = FRAME;
				urlAnchor[urlAnchorLength] = '\0';
				onAnchorFound(urlAnchor);
			}
			else {
				if ( urlAnchorLength < MaxURLLength-1) {
					urlAnchor[urlAnchorLength] = *b;
					urlAnchorLength++;
				}
				b++;
			}
			break;
		}
		case SCRIPT: {
			if (match(&b,"/SCRIPT>")) {
				// End script
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case COMMENT: {
			if (match(&b,"-->")) {
				// End comments
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case TAG: {
			if (match(&b, ">")) {
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		default:;
		}

	}

	return true;
}
示例#2
0
// Scans the first n bytes of buffer with a small state machine and reports
// what it finds through the parser callbacks:
//   - text outside tags, scripts and comments is passed to onContentFound()
//     one character at a time, with every run of blanks (space, tab, CR, LF)
//     reduced to a single ' ';
//   - every href="..." inside an <A ...> tag and every src="..." inside a
//     <FRAME ...> tag is passed to onAnchorFound() as a NUL-terminated string
//     (truncated to MaxURLLength-1 characters).
//
// match() is defined elsewhere; this code assumes it advances b past the
// literal when the literal is found at b and leaves b unchanged otherwise
// -- confirm against its definition.
//
// Parameters:
//   buffer - start of the HTML text.
//   n      - number of bytes of buffer to scan.
// Returns true once the whole buffer has been scanned. (The previous version
// fell off the end of a bool function, which is undefined behaviour.)
bool
SimpleHTMLParser::parse(char * buffer, int n)
{
	enum { START, TAG, SCRIPT, ANCHOR, HREF,
	       COMMENT, FRAME, SRC } state;

	state = START;

	char * bufferEnd = buffer + n;
	char * b = buffer;
	// True while the last content character emitted was a collapsed blank
	bool lastCharSpace = false;
	while (b < bufferEnd) {
		switch (state) {
		case START: {
			// Order matters: the specific tags must be tried before the
			// generic "<" fallback.
			if (match(&b,"<SCRIPT")) {
				state = SCRIPT;
			}
			else if (match(&b,"<!--")) {
				state = COMMENT;
			}
			else if (match(&b,"<A ")) {
				state = ANCHOR;
			}
			else if (match(&b,"<FRAME ")) {
				state = FRAME;
			}
			else if (match(&b,"<")) {
				state = TAG;
			}
			else {
				char c = *b;
				// Substitute one or more blank chars with a single space
				if (c=='\n'||c=='\r'||c=='\t'||c==' ') {
					if (!lastCharSpace) {
						onContentFound(' ');
					}
					lastCharSpace = true;
				}
				else {
					onContentFound(c);
					lastCharSpace = false;
				}

				b++;
			}
			break;
		}
		case ANCHOR: {
			if (match(&b,"href=\"")) {
				state = HREF;
				urlAnchorLength=0;
			}
			else if (match(&b,">")) {
				// End of the <A ...> tag
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case HREF: {
			if (match(&b,"\"")) {
				// Closing quote: the URL is complete
				state = ANCHOR;
				urlAnchor[urlAnchorLength] = '\0';
				onAnchorFound(urlAnchor);
			}
			else {
				// Keep one slot free for the terminating NUL; longer URLs
				// are truncated
				if ( urlAnchorLength < MaxURLLength-1) {
					urlAnchor[urlAnchorLength] = *b;
					urlAnchorLength++;
				}
				b++;
			}
			break;
		}
		case FRAME: {
			if (match(&b,"src=\"")) {
				state = SRC;
				urlAnchorLength=0;
			}
			else if (match(&b,">")) {
				// End of the <FRAME ...> tag
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case SRC: {
			if (match(&b,"\"")) {
				// Closing quote: the frame URL is complete
				state = FRAME;
				urlAnchor[urlAnchorLength] = '\0';
				onAnchorFound(urlAnchor);
			}
			else {
				if ( urlAnchorLength < MaxURLLength-1) {
					urlAnchor[urlAnchorLength] = *b;
					urlAnchorLength++;
				}
				b++;
			}
			break;
		}
		case SCRIPT: {
			if (match(&b,"/SCRIPT>")) {
				// End script
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case COMMENT: {
			if (match(&b,"-->")) {
				// End comments
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		case TAG: {
			if (match(&b, ">")) {
				state = START;
			}
			else {
				b++;
			}
			break;
		}
		default:;
		}

	}

	return true;
}