/*
 * Curl write callback: cleans one received chunk of a remote file and
 * appends the result to the caller's filebuffer.
 *
 * name: curlCallback
 * @param contents  chunk handed over by libcurl; size * nmemb bytes long and
 *                  NOT NUL-terminated
 * @param size      size of one element
 * @param nmemb     number of elements in the chunk
 * @param data      the struct filebuffer* registered as write data
 * @return size * nmemb when the chunk was stored, 0 on failure (a short
 *         count tells libcurl to abort the transfer)
 *
 * On failure fb->fbuffer and fb->fbsize are left exactly as they were.
 */
static size_t curlCallback(void *contents, size_t size, size_t nmemb, void *data)
{
    size_t realsize = size * nmemb;
    struct filebuffer *fb = (struct filebuffer *) data;
    char *chunk = NULL;
    char *smt = NULL;
    char *sna = NULL;
    char *grown = NULL;
    size_t sna_size = 0;
    size_t consumed = 0;

    /*
     * The strip helpers take only a char*, so they need a terminated string.
     * Work on a private, NUL-terminated copy of the chunk.
     */
    chunk = (char *) malloc(realsize + 1);
    if (!chunk)
        goto nomem;
    if (realsize > 0)
        memcpy(chunk, contents, realsize);
    chunk[realsize] = '\0';

    /* A NULL result from either helper is treated like an allocation failure. */
    smt = stripMarkupTags(chunk);
    if (!smt)
        goto nomem;
    sna = stripNonAlpha(smt);
    if (!sna)
        goto nomem;
    sna_size = strlen(sna);

    /* Grow through a temporary so the old buffer survives a failed realloc. */
    grown = (char *) realloc(fb->fbuffer, fb->fbsize + sna_size + 1);
    if (!grown)
        goto nomem;
    fb->fbuffer = grown;

    memcpy(fb->fbuffer + fb->fbsize, sna, sna_size);
    fb->fbsize += sna_size;
    fb->fbuffer[fb->fbsize] = '\0';

    consumed = realsize;
    goto done;

nomem:
    fprintf(stderr, "Can't allocate memory for buffer. Returning 0 bytes.\n");

done:
    free(sna);
    free(smt);
    free(chunk);
    return consumed;
}
/*
 * Write the packed-ASCII (non-statistical) encoding of message m into c:
 * the two model-selection symbols, the message length, the packed
 * characters, and finally the range-coder conclusion.
 */
static void stats3_encode_packed_ascii(range_coder *c, unsigned char *m, size_t len)
{
    range_encode_equiprobable(c, 2, 1);             /* not raw ASCII */
    range_encode_symbol(c, &probPackedASCII, 2, 0); /* is packed ASCII */
    encodeLength(c, len);
    encodePackedASCII(c, (char *)m);
    range_conclude(c);
}

/*
 * Compress the NUL-terminated message m into range coder c.
 *
 * The message is first encoded with the statistical model (length,
 * non-alpha characters, lower-cased letters/spaces, then case).  Two
 * fallbacks are then considered, in order:
 *   1. packed ASCII, tried only when the message has no non-alpha characters
 *      and the model used at least 7 bits per character;
 *   2. raw 8-bit bytes, used when the result is still at least 8 bits per
 *      character and the first byte has its top bit clear.
 *
 * The per-stage entropy deltas are accumulated into the file-level total_*
 * counters.
 *
 * @param c  range coder receiving the encoding
 * @param m  NUL-terminated message to compress
 * @return 0 on success, -1 if the message is too long for the internal
 *         1024-byte work buffers (nothing is written to c in that case)
 */
int stats3_compress_bits(range_coder *c, unsigned char *m)
{
    unsigned char alpha[1024];   /* message with all non alpha/spaces removed */
    unsigned char lcalpha[1024]; /* alpha with letters folded to lower-case */
    /* NOTE(review): len is reused below on the assumption that none of the
       encode/strip helpers changes the length of m - confirm. */
    size_t len = strlen((char *)m);
    double lastEntropy;
    int nonAlphaChars;

    /*
     * alpha and lcalpha are fixed-size stack buffers.  A message of 1024
     * bytes or more could overflow them (presumably the strip helpers copy up
     * to the whole message plus terminator), so refuse it up front.
     */
    if (len >= sizeof alpha)
        return -1;

    /* Use model instead of just packed ASCII */
    range_encode_equiprobable(c, 2, 1);             /* not raw ASCII */
    range_encode_symbol(c, &probPackedASCII, 2, 1); /* not packed ASCII */
    total_model_bits += c->entropy;
    lastEntropy = c->entropy;

    /* Encode length of message */
    encodeLength(c, len);
    total_length_bits += c->entropy - lastEntropy;
    lastEntropy = c->entropy;

    /* Encode any non-alpha characters */
    encodeNonAlpha(c, m);
    stripNonAlpha(m, alpha);
    nonAlphaChars = (int)(len - strlen((char *)alpha));
    total_nonalpha_bits += c->entropy - lastEntropy;
    lastEntropy = c->entropy;

    /* Compress lower-cased version of message */
    stripCase(alpha, lcalpha);
    encodeLCAlphaSpace(c, lcalpha);
    total_alpha_bits += c->entropy - lastEntropy;
    lastEntropy = c->entropy;

    /* Case must be encoded after symbols, so we know how many
       letters and where word breaks are. */
    mungeCase((char *)alpha);
    encodeCaseModel1(c, alpha);
    total_case_bits += c->entropy - lastEntropy;

    range_conclude(c);
    total_finalisation_bits += c->bits_used - c->entropy;

    if ((!nonAlphaChars) && c->bits_used >= 7 * len) {
        /* Can we code it more efficiently without statistical modelling?
           Trial-encode into a scratch coder and compare sizes. */
        range_coder *c2 = range_new_coder(1024);
        if (c2) {
            stats3_encode_packed_ascii(c2, m, len);
            if (c2->bits_used < c->bits_used) {
                range_coder_reset(c);
                stats3_encode_packed_ascii(c, m, len);
            }
            range_coder_free(c2);
        }
    }

    if ((c->bits_used >= 8 * len) && (!(m[0] & 0x80))) {
        /* We can't encode it more efficiently than 8-bit raw.
           We can only do this if the MSB of the first char of the message is
           0, as we use the first bit of the message to indicate whether it
           is compressed or not. */
        int i;
        range_coder_reset(c);
        for (i = 0; m[i]; i++)
            c->bit_stream[i] = m[i];
        c->bits_used = 8 * i;
        c->entropy = 8 * i;
    }

    return 0;
}
/* * name: proccessFile * @param string * @return pointer to struct filebuffer */ struct filebuffer *proccessFile(const char *pFile) { struct filebuffer *fb = NULL; FILE *srchFile; #if defined(HAVE_CURL_CURL_H) && (defined(HAVE_LIBCURL) || defined(HAVE_LIBCURLDLL)) CURL *webRes; if (strncasecmp(pFile, "https:", 6) == 0 || strncasecmp(pFile, "http:", 5) == 0) { fb = (struct filebuffer*) malloc(sizeof(struct filebuffer)); fb->fbuffer = (char*) malloc(1); fb->fbsize = 0; curl_global_init(CURL_GLOBAL_ALL); webRes = curl_easy_init(); curl_easy_setopt(webRes, CURLOPT_URL, pFile); curl_easy_setopt(webRes, CURLOPT_WRITEFUNCTION, curlCallback); curl_easy_setopt(webRes, CURLOPT_WRITEDATA, (void*) fb); curl_easy_setopt(webRes, CURLOPT_USERAGENT, "libcurl-agent/1.0"); curl_easy_perform(webRes); curl_easy_cleanup(webRes); curl_global_cleanup(); } else #endif if ((srchFile = fopen(pFile, "r"))) { fb = (struct filebuffer*) malloc(sizeof(struct filebuffer)); char *tmpbuff = NULL; fseek(srchFile, 0, SEEK_END); fb->fbsize = ftell(srchFile); rewind(srchFile); if (! (tmpbuff = (char*) malloc(sizeof(char) * fb->fbsize)) ) { fprintf(stderr, "Can't allocate memory for temporary buffer for file %s.\n", pFile); return NULL; } if (fread(tmpbuff, sizeof(char), fb->fbsize, srchFile) != fb->fbsize) { fprintf(stderr, "Can't read file %s.\n", pFile); return NULL; } fclose(srchFile); char *smt = stripMarkupTags(tmpbuff); char *sna = stripNonAlpha(smt); size_t sna_size = strlen(sna); free(smt); free(tmpbuff); if (! (fb->fbuffer = (char*) malloc(sizeof(char) * sna_size)) ) { fprintf(stderr, "Can't allocate memory for buffer.\n"); return NULL; } strncpy(fb->fbuffer, sna, sna_size); fb->fbsize = sna_size; free(sna); } return fb; }