Beispiel #1
0
/*
 * Curl write callback: filters one chunk of a remote file and appends the
 * result to the caller's filebuffer.
 *
 * The chunk is run through stripMarkupTags() and stripNonAlpha(); the
 * filtered text is appended to fb->fbuffer, which is kept NUL-terminated,
 * and fb->fbsize is advanced by the number of bytes appended (terminator
 * not counted).
 *
 * name: curlCallback
 * @param contents  chunk delivered by libcurl (NOT NUL-terminated)
 * @param size      size of one element
 * @param nmemb     number of elements in the chunk
 * @param data      the struct filebuffer* registered via CURLOPT_WRITEDATA
 * @return size * nmemb on success; 0 on failure. The existing buffer
 *         contents are left intact on failure.
 */
static size_t curlCallback(void *contents, size_t size, size_t nmemb, void *data) {
	size_t realsize = size * nmemb;
	struct filebuffer *fb = (struct filebuffer*) data;
	size_t result = 0;
	size_t sna_size;
	char *chunk = NULL;
	char *smt = NULL;
	char *sna = NULL;
	char *grown;

	/*
	 * libcurl does not NUL-terminate the data it passes in, but the strip
	 * helpers take a plain char* with no length. Work on a terminated copy
	 * so they cannot read past the end of the chunk.
	 */
	if (! (chunk = (char*) malloc(realsize + 1)) ) {
		fprintf(stderr, "Can't allocate memory for buffer. Returning 0 bytes.\n");
		return 0;
	}
	memcpy(chunk, contents, realsize);
	chunk[realsize] = '\0';

	smt = stripMarkupTags(chunk);
	if (smt)
		sna = stripNonAlpha(smt);
	if (!sna) {
		fprintf(stderr, "Can't allocate memory for buffer. Returning 0 bytes.\n");
		goto cleanup;
	}
	sna_size = strlen(sna);

	/* Grow through a temporary so the old buffer survives a failed realloc. */
	grown = (char*) realloc(fb->fbuffer, fb->fbsize + sna_size + 1);
	if (!grown) {
		fprintf(stderr, "Can't allocate memory for buffer. Returning 0 bytes.\n");
		goto cleanup;
	}
	fb->fbuffer = grown;

	memcpy(fb->fbuffer + fb->fbsize, sna, sna_size);
	fb->fbsize += sna_size;
	fb->fbuffer[fb->fbsize] = '\0';
	result = realsize;

cleanup:
	free(sna);
	free(smt);
	free(chunk);
	return result;
}
Beispiel #2
0
/*
 * Encode the NUL-terminated message m into the range coder c, choosing the
 * smallest of three representations:
 *
 *   1. Statistical model: length, non-alpha characters, lower-cased
 *      alpha/space text, then case information (always attempted first).
 *   2. Packed ASCII: tried only when the message has no non-alpha
 *      characters and the model used at least 7 bits per character; kept
 *      only if a trial encoding in a scratch coder comes out smaller.
 *   3. Raw 8-bit bytes: used when the result so far is at least 8 bits per
 *      character and the first byte of m has its top bit clear.
 *
 * The per-stage entropy deltas of the statistical attempt are added to the
 * total_*_bits accumulators (declared outside this block).
 *
 * @param c  range coder that receives the encoded message
 * @param m  NUL-terminated message to compress
 * @return   always 0
 *
 * NOTE(review): alpha[] and lcalpha[] are fixed 1024-byte buffers and no
 * length check on m is visible here -- confirm the strip helpers bound
 * their writes or that callers limit the message length.
 */
int stats3_compress_bits(range_coder *c,unsigned char *m)
{
  unsigned char alpha[1024]; // message with all non alpha/spaces removed
  unsigned char lcalpha[1024]; // message with all alpha chars folded to lower-case

  /* Use model instead of just packed ASCII */
  range_encode_equiprobable(c,2,1); // not raw ASCII
  range_encode_symbol(c,&probPackedASCII,2,1); // not packed ASCII

  // printf("%f bits to encode model\n",c->entropy);
  total_model_bits+=c->entropy;
  // lastEntropy marks the entropy at the end of the previous stage, so each
  // stage's cost is the difference c->entropy-lastEntropy.
  double lastEntropy=c->entropy;
  
  /* Encode length of message */
  encodeLength(c,strlen((char *)m));
  
  // printf("%f bits to encode length\n",c->entropy-lastEntropy);
  total_length_bits+=c->entropy-lastEntropy;
  lastEntropy=c->entropy;

  /* Encode the non-alpha characters, then produce alpha[] (the message
     with them removed).  A length difference of zero means the message
     contained no non-alpha characters. */
  encodeNonAlpha(c,m);
  stripNonAlpha(m,alpha);
  int nonAlphaChars=strlen(m)-strlen(alpha);

  //  printf("%f bits (%d emitted) to encode non-alpha\n",c->entropy-lastEntropy,c->bits_used);
  total_nonalpha_bits+=c->entropy-lastEntropy;

  lastEntropy=c->entropy;

  /* compress lower-caseified version of message */
  stripCase(alpha,lcalpha);
  encodeLCAlphaSpace(c,lcalpha);

  // printf("%f bits (%d emitted) to encode chars\n",c->entropy-lastEntropy,c->bits_used);
  total_alpha_bits+=c->entropy-lastEntropy;

  lastEntropy=c->entropy;
  
  /* case must be encoded after symbols, so we know how many
     letters and where word breaks are.
     mungeCase() is handed alpha as a non-const pointer, presumably to
     rewrite it in place before the case model is encoded -- TODO confirm.
 */
  mungeCase((char *)alpha);
  encodeCaseModel1(c,alpha);
  
  //  printf("%f bits (%d emitted) to encode case\n",c->entropy-lastEntropy,c->bits_used);
  total_case_bits+=c->entropy-lastEntropy;

  range_conclude(c);
  // printf("%d bits actually used after concluding.\n",c->bits_used);
  // Finalisation overhead: bits actually emitted beyond the modelled entropy.
  total_finalisation_bits+=c->bits_used-c->entropy;

  /* Packed-ASCII fallback: only considered when there are no non-alpha
     characters and the model needed at least 7 bits per character. */
  if ((!nonAlphaChars)&&c->bits_used>=7*strlen((char *)m))
    {
      /* Can we code it more efficiently without statistical modelling? */
      /* Trial-encode into a scratch coder first so c is only overwritten
         when packed ASCII is strictly smaller. */
      range_coder *c2=range_new_coder(1024);
      range_encode_equiprobable(c2,2,1); // not raw ASCII
      range_encode_symbol(c2,&probPackedASCII,2,0); // is packed ASCII
      encodeLength(c2,strlen((char *)m));
      encodePackedASCII(c2,(char *)m);
      range_conclude(c2);
      if (c2->bits_used<c->bits_used) {
	/* Packed ASCII wins: redo the same encoding into c itself. */
	range_coder_reset(c);
	range_encode_equiprobable(c,2,1); // not raw ASCII
	range_encode_symbol(c,&probPackedASCII,2,0); // is packed ASCII
	encodeLength(c,strlen((char *)m));
	encodePackedASCII(c,(char *)m);
	range_conclude(c);
	// printf("Reverting to raw non-statistical encoding: %d chars in %d bits\n",
	//        (int)strlen((char *)m),c->bits_used);
      }
      range_coder_free(c2);
    }
  
  if ((c->bits_used>=8*strlen((char*)m))
      &&(!(m[0]&0x80)))
    {
      /* We can't encode it more efficiently than 8-bit raw.
         We can only do this if the MSB of the first char of the message is
         0, as we use the first bit of the message to indicate whether it is
         compressed or not. */
      int i;
      range_coder_reset(c);
      /* Copy the message bytes verbatim into the coder's output buffer.
         NOTE(review): no bound on bit_stream's capacity is checked here --
         confirm it can hold strlen(m) bytes. */
      for(i=0;m[i];i++) c->bit_stream[i]=m[i];
      c->bits_used=8*i;
      c->entropy=8*i;

      // printf("Reverting to raw 8-bit encoding: used %d bits\n",c->bits_used);
    }

  return 0;
}
Beispiel #3
0
/*
 * Helper for proccessFile: reads the whole of an already opened local file,
 * filters it through stripMarkupTags() and stripNonAlpha(), and returns the
 * result in a newly allocated filebuffer. The stream is not closed here.
 *
 * @param srchFile  open stream positioned anywhere; it is rewound
 * @param pFile     file name, used only in error messages
 * @return new filebuffer (fbuffer NUL-terminated, fbsize excludes the
 *         terminator), or NULL on error with nothing leaked
 */
static struct filebuffer *readLocalFile(FILE *srchFile, const char *pFile) {
	struct filebuffer *fb = NULL;
	struct filebuffer *result = NULL;
	char *tmpbuff = NULL;
	char *smt = NULL;
	char *sna = NULL;
	long fileLen;
	size_t rawSize;
	size_t sna_size;

	if (fseek(srchFile, 0, SEEK_END) != 0) {
		fprintf(stderr, "Can't read file %s.\n", pFile);
		goto cleanup;
	}
	fileLen = ftell(srchFile);
	if (fileLen < 0) {
		/* ftell reports failure as -1; never use that as a size. */
		fprintf(stderr, "Can't read file %s.\n", pFile);
		goto cleanup;
	}
	rewind(srchFile);
	rawSize = (size_t) fileLen;

	/* One extra byte: the strip helpers take a plain char* with no length. */
	if (! (tmpbuff = (char*) malloc(rawSize + 1)) ) {
		fprintf(stderr, "Can't allocate memory for temporary buffer for file %s.\n", pFile);
		goto cleanup;
	}

	if (fread(tmpbuff, sizeof(char), rawSize, srchFile) != rawSize) {
		fprintf(stderr, "Can't read file %s.\n", pFile);
		goto cleanup;
	}
	tmpbuff[rawSize] = '\0';

	smt = stripMarkupTags(tmpbuff);
	if (smt)
		sna = stripNonAlpha(smt);
	if (!sna) {
		fprintf(stderr, "Can't allocate memory for buffer.\n");
		goto cleanup;
	}
	sna_size = strlen(sna);

	if (! (fb = (struct filebuffer*) malloc(sizeof(struct filebuffer))) ) {
		fprintf(stderr, "Can't allocate memory for buffer.\n");
		goto cleanup;
	}

	/* Keep the buffer NUL-terminated, matching what curlCallback builds. */
	if (! (fb->fbuffer = (char*) malloc(sna_size + 1)) ) {
		fprintf(stderr, "Can't allocate memory for buffer.\n");
		goto cleanup;
	}
	memcpy(fb->fbuffer, sna, sna_size + 1);
	fb->fbsize = sna_size;

	/* Success: hand ownership to the caller. */
	result = fb;
	fb = NULL;

cleanup:
	free(fb);
	free(sna);
	free(smt);
	free(tmpbuff);
	return result;
}

/*
 * Loads a document, strips markup tags and non-alpha characters, and
 * returns the filtered text.
 *
 * When built with libcurl, names starting with "http:" or "https:"
 * (case-insensitive) are fetched over the network and filtered chunk by
 * chunk through curlCallback; a failed transfer still returns whatever was
 * received (possibly an empty buffer). Anything else is opened as a local
 * file.
 *
 * name: proccessFile
 * @param pFile  URL or path of the file to process
 * @return pointer to a newly allocated struct filebuffer whose fbuffer is
 *         NUL-terminated and fbsize is the text length; the caller owns
 *         both allocations. NULL if the file cannot be opened or read, or
 *         on allocation failure.
 */
struct filebuffer *proccessFile(const char *pFile) {
	struct filebuffer *fb = NULL;
	FILE *srchFile;
#if defined(HAVE_CURL_CURL_H) && (defined(HAVE_LIBCURL) || defined(HAVE_LIBCURLDLL))
	CURL *webRes;
	if (strncasecmp(pFile, "https:", 6) == 0 ||
	    strncasecmp(pFile, "http:", 5) == 0) {
		if (! (fb = (struct filebuffer*) malloc(sizeof(struct filebuffer))) ) {
			fprintf(stderr, "Can't allocate memory for buffer.\n");
			return NULL;
		}
		if (! (fb->fbuffer = (char*) malloc(1)) ) {
			fprintf(stderr, "Can't allocate memory for buffer.\n");
			free(fb);
			return NULL;
		}
		/* Start as a valid empty string in case no data ever arrives. */
		fb->fbuffer[0] = '\0';
		fb->fbsize = 0;
		curl_global_init(CURL_GLOBAL_ALL);
		webRes = curl_easy_init();
		if (webRes) {
			curl_easy_setopt(webRes, CURLOPT_URL, pFile);
			curl_easy_setopt(webRes, CURLOPT_WRITEFUNCTION, curlCallback);
			curl_easy_setopt(webRes, CURLOPT_WRITEDATA, (void*) fb);
			curl_easy_setopt(webRes, CURLOPT_USERAGENT, "libcurl-agent/1.0");
			curl_easy_perform(webRes);
			curl_easy_cleanup(webRes);
		}
		curl_global_cleanup();
	} else
#endif
	if ((srchFile = fopen(pFile, "r"))) {
		fb = readLocalFile(srchFile, pFile);
		/* Close on every path, including read/allocation failures. */
		fclose(srchFile);
	}

	return fb;
}