Exemple #1
0
char * fetchHTML( const char * url, int * n)
{
	// Open URL
	char contentType[ MaxLineLength ];
	FILE * f = openhttp( url, contentType );
	if ( f == NULL ) {
		return NULL;
	}
	
	if (strcmp(contentType, "text/html")!=0) {
		// Not HTML
		printf("Content Type %s not text/html. Type: %s\n", url, contentType);
		fclose(f);
		return NULL;
	}

	// Allocate an initial buffer
	int memIncrement = 8 * 1024;
	int maxBuffer = memIncrement;
	char * buffer = NULL;
	if (buffer == NULL) {
		buffer = (char *) malloc( maxBuffer );
		if (buffer == NULL) {
			perror("malloc");
			return NULL;
		}
	}
	
	int l;
	*n = 0;
	while ( (l=fread( buffer + *n, 1, maxBuffer - *n, f)) > 0) {
		*n += l;
		if ( maxBuffer - *n == 0 ) {
			// Increase buffer size
			maxBuffer += memIncrement;
			buffer = (char *) realloc(buffer, maxBuffer);
		}
	}
		
	if ( *n < 0) {
		fclose(f);
		free(buffer);
		return NULL;
	}
	
	fclose( f );
	
	return buffer;
}
Exemple #2
0
int
main( int argc, char ** argv )
{
  // Skip command
  argv++;
  // Process the arguments
  if ( !strcmp(*argv,"-h") ) {
    printUsage();
    exit(1);
  }
  
  if ( !strcmp(*argv,"-t") ) {
    ttag = true;
    argv++; //argv[0] should be the html. The following NULL test still applies
  }
  if ( !strcmp(*argv,"-a") ) {
    atag = true;
    argv++; //argv[0] should be the html. The following NULL test still applies
  }

  if ( *argv == NULL ) {
    // Print usage also if no URL after the arguments
    printUsage();
    exit(1);
  }

  // Open URL
  char contentType[ MaxLineLength ];
  char * url = *argv;


  FILE * f = openhttp( url, contentType );
  if ( f == NULL ) {
    exit(1);
  }

  if(ttag == false && atag == false){ //no -t tag
    // Print the content type
    printf( "Content Type: \"%s\"\n", contentType );

    // Print to stdout line by line
    char line[ MaxLineLength ];
    while ( fgets( line,MaxLineLength, f ) ) {
      fputs( line, stdout );
    }
  }
  else{//-t or -a argument
    if(strcmp(contentType,"text/html") != 0){
      printf("Error: document is not text/html\n");
      exit(0);
    }
    printf( "Content Type: \"%s\"\n", contentType );
    char line[ MaxLineLength ];
    while ( fgets( line,MaxLineLength, f ) ) {
      strcat(html, line);
    }

    ctagp = html;
    if(atag == true){

      char* begin = urls;
      char * base = strdup(url);
      //printf("base = %s\n",base + strlen(base) - 6);      
      if(strncmp(base + strlen(base) - 5,".html",5)==0  || strncmp(base + strlen(base) - 5,".htm",4)==0 ||  strncmp(base + strlen(base) - 5,".shtml",4)==0 )
	while(base[strlen(base)-1] != '/')
	  base[strlen(base)-1] = '\0';
      if(base[strlen(base)-1] == '/')
	base[strlen(base)-1] = '\0';


      ctag = regcmp("<[Aa] [hH][Rr][eE][fF]=\"", NULL);
      ctag2 = regcmp("\"", NULL);
      while(1){
	if( (ctagp = regex(ctag,ctagp)) == NULL)
	  break;
	if( (ctag2p = regex(ctag2,ctagp)) == NULL)
	  break;
	ctag2p--; //1 less that "
	if(strncmp(ctagp,"mailto:",7)==0 || ctagp[0] == '#'){
	  continue;
	}
	else if( (strncmp(ctagp,"http",4)!=0) && (strncmp(ctagp,"ftp://",6)!=0)) {
	  if(ctagp[0] == '/'){
	    //get to the root of the base tag
	    int index = 0;
	    int slashes = 0;
	    while(slashes!=3){
	      urls[index++] = base[index++];
	      if(base[index] == '/' || base[index] == '\0')
		slashes++;
	    }
	    urls += index;	      
	  }
	  else if(strncmp(ctagp,"../",3) == 0){ //move back x../ directories in base
	    char * newbase = strdup(base);
	    //printf("newbase\n");
	    while(strncmp(ctagp,"../",3) == 0){
	      while(newbase[strlen(newbase)-1] != '/'){
		newbase[strlen(newbase)-1] = '\0';		
	      }
	      if(newbase[strlen(newbase)-1] == '/')
		newbase[strlen(newbase)-1] = '\0';
	      ctagp +=3;
	    }
	    strncpy(urls,newbase,strlen(newbase));
	    urls += strlen(newbase);
	    urls[0] = '/';
	    urls++;
	    free(newbase);
	  }
	  else{
	    strncpy(urls,base,strlen(base));
	    urls += strlen(base);
	    urls[0] = '/';
	    urls++;
	  }
	}
	strncpy(urls,ctagp,ctag2p-ctagp);
	urls +=ctag2p-ctagp;
	urls[0] = '\n';
	urls++;
      }
      

      urls[0] = '\0';

      printf("%s", begin);
      delete(html);
      delete(begin);
      fclose( f );  
      exit(0);
    }


    if ((ctag = regcmp("<[Hh][tT][Mm][Ll]", NULL))==NULL){
      printf("invalid expression exitting\n");
      exit(0);
    }
    ctag = regcmp(">",NULL);
    ctagp = regex(ctag,html);
    ctag2p = regex(ctag,html);
    html[0] = NULL;
    strcat(html, ctagp);

    //remove scripts from html
    ctag = regcmp("<script", NULL); //[ -=?-~]*>
    ctag2 = regcmp("</script>", NULL);
    //printf("%s\n\n\n",html);
    //printf("length = %d",strlen(html));
    ctagp = html;
    while(1){
      if( (ctagp = regex(ctag,ctagp)) == NULL)
	break; //no script tag found
      ctagp = __loc1; //begging of <scipt ...>
      if( (ctag2p = regex(ctag2,ctagp)) == NULL) //end of </script>
	break;
      //html needs a null character where ctagp is pointing to and should be
      //concatenated with ctag2p to remove script
      //printf("right number is %d\n", __loc1-html);
      //html[ctagp-html] = ' ';
      //html[ctagp-html+1] = NULL;
      //strcat(html, ctag2p);
      //new function
      while(ctagp != ctag2p){
	ctagp[0] = ' ';
	ctagp++;
      }

    }//scripts removed, now remove comments
    free(ctag);
    free(ctag2);
    ctag = regcmp("<!--", NULL); //[ -=?-~]*>
    ctag2 = regcmp("-->", NULL);
    ctagp = html;
    while(1){
      if( (ctagp = regex(ctag,ctagp)) == NULL)
	break; //no comment tag found
      ctagp = __loc1; //beginning of comment <!--
      if( (ctag2p = regex(ctag2,ctagp)) == NULL) //end of comment -->
	break;
      //html needs a null character where ctagp is pointing to and should be
      //concatenated with ctag2p to remove script
      while(ctagp != ctag2p){
	ctagp[0] = ' ';
	ctagp++;
      }
    }//comments removed, remove other tags
    free(ctag);
    free(ctag2);
    ctag = regcmp("<", NULL); //[ -=?-~]*>
    ctag2 = regcmp(">", NULL);
    ctagp = html;
    while(1){
      if( (ctagp = regex(ctag,ctagp)) == NULL)
	break;
      ctagp = __loc1;
      if( (ctag2p = regex(ctag2,ctagp)) == NULL) //find next > starting at <
	break;
      while(ctagp != ctag2p){
	ctagp[0] = ' ';
	ctagp++;
      }
    }
    //tags removed, now remove anything from & to ;
    free(ctag);
    free(ctag2);
    ctag = regcmp("&", NULL);
    ctag2 = regcmp(";", NULL);
    ctagp = html;
    while(1){
      if( (ctagp = regex(ctag,ctagp)) == NULL)
	break;
      ctagp = __loc1;
      if( (ctag2p = regex(ctag2,ctagp)) == NULL) //find next ; starting at &
	break;
      while(ctagp != ctag2p){
	ctagp[0] = ' ';
	ctagp++;
      }
    }
    //find punctuation that will be cut
    //for "'" character (i.e can't or Joe's): you can ignore every letter 
    //after "'" and keep the part of the word before 
    free(ctag);
    free(ctag2);
    ctag = regcmp("'", NULL);
    ctag2 = regcmp(" ", NULL);
    ctagp = html;
    while(1){
      if( (ctagp = regex(ctag,ctagp)) == NULL)
	break;
      ctagp = __loc1;
      if( (ctag2p = regex(ctag2,ctagp)) == NULL)
	break;
       while(ctagp != ctag2p){
	ctagp[0] = ' ';
	ctagp++;
      }
    }//remove punctuation
    free(ctag);
    free(ctag2);
    
    //ctag = regcmp("[!-@[-`{-~]", NULL);
    int y = 0;
    while(html[y] != '\0'){
      //if( (html[i] > '!' && html[i]<'@') || (html[i] > '[' && html[i]<'`') || (html[i] > '{' && html[i]<']' || html[i] ==  )
      if( (html[y]>='a' && html[y]<='z') || (html[y]>='A' && html[y]<='Z') || html[y] ==' ' ||  html[y] =='\n' ||  html[y] == '\t')
	y++;
      else{
	html[y] = ' ';
	y++;
      }
    }
    //while(1){
    //if( (ctagp = regex(ctag,html)) == NULL)
    //break;
    //html[__loc1-html] = ' ';
    //printf("i = %d\n",i++);
    //}
    trim(html);
    //printf("length = %d\n",strlen(html));
    printf("%s\n",html);
  }
  delete(html);
  delete(urls);
  fclose( f );  
  exit(0);
}