示例#1
0
文件: page_db.c 项目: plafl/aduana
/** Serialize the PageInfo into a contiguous block of memory.
 *
 * Note that enough new memory will be allocated inside val.mv_data to contain
 * the results of the dump. This memory should be freed when it is no longer
 * necessary (for example after an mdb_cursor_put).
 *
 * Layout of the serialized block, in order:
 *   1. an unsigned short with the length in bytes of the compressed URL
 *   2. the URL as compressed by smaz_compress
 *   3. score, linked_from, depth, n_crawls
 *   4. only if n_crawls > 0: first_crawl
 *   5. only if n_crawls > 1: last_crawl, n_changes
 *   6. only if n_crawls > 0: content_hash_length followed by the
 *      content_hash bytes
 * All fields are copied byte for byte in their in-memory representation.
 *
 * @param pi The PageInfo to be serialized
 * @param val The destination of the serialization. Should have no memory
 *            allocated inside mv_data since new memory will be allocated.
 *
 * @return 0 if success, -1 if failure. On failure nothing is left allocated:
 *         val->mv_data is NULL and val->mv_size is 0.
 */
static int
page_info_dump(const PageInfo *pi, MDB_val *val) {
     /* To save space we apply the following 'compression' method:
        1. If n_crawls > 1 all data is saved
        2. If n_crawls = 1 we have the following constraints:
               last_crawl = first_crawl
               n_changes  = 0
           And so we don't bother to store last_crawl and n_changes
        3. If n_crawls = 0, which is the very common case of an uncrawled page:
               first_crawl         = 0
               last_crawl          = 0
               n_changes           = 0
               content_hash_length = 0
               content_hash        = NULL
     */

     // TODO: store the following fields using varint compression:
     //    - n_crawls
     //    - n_changes
     //    - content_hash_length
     //    - depth

     // Size of everything that follows the URL; the URL cost is only known
     // after compressing it.
     size_t fields_size = sizeof(pi->linked_from) +
          sizeof(pi->score) + sizeof(pi->n_crawls) + sizeof(pi->depth);
     if (pi->n_crawls > 0) {
          fields_size += sizeof(pi->first_crawl) +
               pi->content_hash_length + sizeof(pi->content_hash_length);
          if (pi->n_crawls > 1)
               fields_size += sizeof(pi->last_crawl) + sizeof(pi->n_changes);
     }

     const size_t header_size = sizeof(unsigned short);
     const size_t url_size = strlen(pi->url);
     // Room reserved for the compressed URL
     const size_t max_curl_size = 4*url_size;

     // The length prefix must be part of the allocation: without it an empty
     // URL (or a worst case compression) would write past the buffer end.
     val->mv_size = 0;
     char *data = val->mv_data = malloc(header_size + max_curl_size + fields_size);
     if (!data)
          return -1;

     size_t curl_size = (size_t)smaz_compress(
          pi->url, url_size, data + header_size, 4*url_size);
     // Fail if the compressed URL did not fit in the reserved room or if its
     // length cannot be represented in the unsigned short prefix.
     if (curl_size > max_curl_size ||
         curl_size > (size_t)(unsigned short)-1) { // TODO
          free(data);
          val->mv_data = NULL;
          return -1;
     }
     const unsigned short curl_size_prefix = (unsigned short)curl_size;
     memcpy(data, &curl_size_prefix, sizeof(curl_size_prefix));

     // Write position inside data
     size_t i = header_size + curl_size;
#define PAGE_INFO_WRITE(x) do { memcpy(data + i, &(x), sizeof(x)); i += sizeof(x); } while (0)
     PAGE_INFO_WRITE(pi->score);
     PAGE_INFO_WRITE(pi->linked_from);
     PAGE_INFO_WRITE(pi->depth);
     PAGE_INFO_WRITE(pi->n_crawls);
     if (pi->n_crawls > 0) {
          PAGE_INFO_WRITE(pi->first_crawl);
          if (pi->n_crawls > 1) {
               PAGE_INFO_WRITE(pi->last_crawl);
               PAGE_INFO_WRITE(pi->n_changes);
          }
          PAGE_INFO_WRITE(pi->content_hash_length);
          // content_hash may be NULL when the length is 0, so skip the copy
          if (pi->content_hash_length > 0) {
               memcpy(data + i, pi->content_hash, pi->content_hash_length);
               i += pi->content_hash_length;
          }
     }

     // Bytes actually used: prefix + compressed URL + fields
     val->mv_size = i;

     return 0;
}
/**
 * Fill this object's hash_table, bucket_table and strings from the messages
 * of p_from, then copy its locale. The whole body is compiled only when
 * TOOLS_ENABLED is defined; otherwise the function does nothing.
 *
 * Steps:
 *  1. Every key is placed in bucket hash(0, key) % size and its message is
 *     compressed with smaz_compress (kept uncompressed when compression
 *     does not make it smaller).
 *  2. For every non-empty bucket a seed d >= 1 is searched such that
 *     hash(d, key) is different for all the keys of that bucket.
 *  3. The tables are written out: hash_table[i] is the position of bucket i
 *     inside bucket_table (0xFFFFFFFF when empty); a bucket is stored as
 *     { entry count, seed } followed, for each entry, by
 *     { seeded hash, string offset, stored size, orig_len }.
 *  4. All stored strings are copied one after another into strings.
 *
 * @param p_from Translation whose messages are packed.
 */
void PHashTranslation::generate(const Ref<Translation> &p_from) {
#ifdef TOOLS_ENABLED
	List<StringName> message_keys;
	p_from->get_message_list(&message_keys);

	int slot_count=Math::larger_prime(message_keys.size());

	print_line("compressing keys: "+itos(message_keys.size()));

	Vector< Vector< Pair<int,CharString> > > buckets;
	Vector< Map< uint32_t, int > > seeded_slots;
	Vector< uint32_t > bucket_seeds;
	Vector< _PHashTranslationCmp > packed_messages;

	seeded_slots.resize(slot_count);
	bucket_seeds.resize(slot_count);
	buckets.resize(slot_count);
	packed_messages.resize(message_keys.size());

	int total_compression_size=0;
	int total_string_size=0;

	// Step 1: distribute the keys over the buckets and compress the messages
	int key_index=0;
	for(List<StringName>::Element *K=message_keys.front();K;K=K->next()) {

		// first level hash (seed 0) selects the bucket
		CharString key_utf8 = K->get().operator String().utf8();
		uint32_t bucket_hash = hash(0,key_utf8.get_data());
		Pair<int,CharString> entry;
		entry.first=key_index;
		entry.second=key_utf8;
		buckets[bucket_hash % slot_count].push_back(entry);

		// pack the translated message
		CharString message_utf8 = p_from->get_message(K->get()).operator String().utf8();
		_PHashTranslationCmp packed;
		packed.orig_len=message_utf8.size();
		packed.offset=total_compression_size;

		if (packed.orig_len==0) {
			// empty message: store a single zero byte
			packed.orig_len=1;
			packed.compressed.resize(1);
			packed.compressed[0]=0;
		} else {
			CharString scratch;
			scratch.resize(message_utf8.size());
			int packed_len = smaz_compress(message_utf8.get_data(),message_utf8.size(),&scratch[0],message_utf8.size());
			if (packed_len<message_utf8.size()) {
				scratch.resize(packed_len);
				packed.compressed=scratch;
			} else {
				// compression did not save anything, keep the original bytes
				packed.orig_len=message_utf8.size();
				packed.compressed=message_utf8;
			}
		}

		packed_messages[key_index]=packed;
		total_compression_size+=packed.compressed.size();
		total_string_size+=message_utf8.size();
		key_index++;
	}

	int bucket_table_size=0;
	print_line("total compressed string size: "+itos(total_compression_size)+" ("+itos(total_string_size)+" uncompressed).");

	// Step 2: find, for each bucket, a seed without collisions inside it
	for(int bucket_idx=0;bucket_idx<slot_count;bucket_idx++) {

		Vector< Pair<int,CharString> > &bucket = buckets[bucket_idx];
		Map< uint32_t, int > &slots=seeded_slots[bucket_idx];

		if (bucket.size()==0)
			continue;

		int seed = 1;
		int placed = 0;

		while(placed < bucket.size()) {

			uint32_t slot = hash(seed,bucket[placed].second.get_data());
			if (!slots.has(slot)) {
				slots[slot]=bucket[placed].first;
				placed++;
				continue;
			}

			// collision: start the bucket again with the next seed
			placed=0;
			seed++;
			slots.clear();
		}

		bucket_seeds[bucket_idx]=seed;
		// 2 header words plus 4 words per entry
		bucket_table_size+=2+bucket.size()*4;

	}


	print_line("bucket table size: "+itos(bucket_table_size*4));
	print_line("hash table size: "+itos(slot_count*4));

	// Step 3: write hash_table and bucket_table
	hash_table.resize(slot_count);
	bucket_table.resize(bucket_table_size);

	DVector<int>::Write hash_table_write = hash_table.write();
	DVector<int>::Write bucket_table_write = bucket_table.write();

	uint32_t *hash_words = (uint32_t*)&hash_table_write[0];
	uint32_t *bucket_words = (uint32_t*)&bucket_table_write[0];

	int write_pos=0;
	int collisions=0;

	for(int bucket_idx=0;bucket_idx<slot_count;bucket_idx++) {

		Map< uint32_t, int > &slots=seeded_slots[bucket_idx];
		if (slots.size()==0) {
			hash_words[bucket_idx]=0xFFFFFFFF; //nothing
			continue;
		}
		if (slots.size()>1) {
			collisions+=slots.size()-1;
		}

		hash_words[bucket_idx]=write_pos;
		bucket_words[write_pos++]=slots.size();
		bucket_words[write_pos++]=bucket_seeds[bucket_idx];

		for( Map< uint32_t, int >::Element *S=slots.front();S;S=S->next()) {

			const _PHashTranslationCmp &stored = packed_messages[S->get()];
			bucket_words[write_pos++]=S->key();
			bucket_words[write_pos++]=stored.offset;
			bucket_words[write_pos++]=stored.compressed.size();
			bucket_words[write_pos++]=stored.orig_len;
		}

	}

	print_line("total collisions: "+itos(collisions));

	// Step 4: copy every stored string at its offset
	strings.resize(total_compression_size);
	DVector<uint8_t>::Write strings_write = strings.write();

	for(int n=0;n<packed_messages.size();n++) {
		memcpy(&strings_write[packed_messages[n].offset],packed_messages[n].compressed.get_data(),packed_messages[n].compressed.size());
	}


	// the number of words written must match the size computed in step 2
	ERR_FAIL_COND(write_pos!=bucket_table_size);
	set_locale(p_from->get_locale());

#endif
}
示例#3
0
文件: smaz_test.c 项目: mojobojo/smaz
/* Round trip test for smaz.
 *
 * First a fixed list of sample strings is compressed and decompressed,
 * printing the compression ratio of each one. Then a large number of random
 * buffers (alternating between a restricted charset and arbitrary bytes) go
 * through the same round trip.
 *
 * Returns 0 when every round trip reproduces its input; calls exit(1) at the
 * first mismatch.
 */
int main(void) {
    static const char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. ";
    char in[512];
    char out[4096];
    char d[4096];
    int comprlen, decomprlen;
    int j, ranlen;
    int times = 1000000;
    char *strings[] = {
        "This is a small string",
        "foobar",
        "the end",
        "not-a-g00d-Exampl333",
        "Smaz is a simple compression library",
        "Nothing is more difficult, and therefore more precious, than to be able to decide",
        "this is an example of what works very well with smaz",
        "1000 numbers 2000 will 10 20 30 compress very little",
        "and now a few italian sentences:",
        "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura",
        "Mi illumino di immenso",
        "L'autore di questa libreria vive in Sicilia",
        "try it against urls",
        "http://google.com",
        "http://programming.reddit.com",
        "http://github.com/antirez/smaz/tree/master",
        "/media/hdb1/music/Alben/The Bla",
        NULL
    };

    for (j = 0; strings[j]; j++) {
        /* All the sample strings are short and non empty, so the length fits
         * an int and can be used as a divisor. */
        int len = (int)strlen(strings[j]);
        int comprlevel;

        comprlen = smaz_compress(strings[j],len,out,sizeof(out));
        /* Signed arithmetic on purpose: the result is negative when the
         * output is larger than the input. */
        comprlevel = 100-((100*comprlen)/len);
        decomprlen = smaz_decompress(out,comprlen,d,sizeof(d));
        if (len != decomprlen || memcmp(strings[j],d,decomprlen))
        {
            printf("BUG: error compressing '%s'\n", strings[j]);
            exit(1);
        }
        if (comprlevel < 0) {
            printf("'%s' enlarged by %d%%\n",strings[j],-comprlevel);
        } else {
            printf("'%s' compressed by %d%%\n",strings[j],comprlevel);
        }
    }

    printf("Compressing and decompressing %d test strings...\n", times);
    while(times--) {
        ranlen = random() % 512;

        /* Odd iterations use text from the charset, even ones raw bytes */
        for (j = 0; j < ranlen; j++) {
            if (times & 1)
                in[j] = charset[random() % (sizeof(charset)-1)];
            else
                in[j] = (char)(random() & 0xff);
        }
        comprlen = smaz_compress(in,ranlen,out,sizeof(out));
        /* The capacity passed must be the one of the destination buffer */
        decomprlen = smaz_decompress(out,comprlen,d,sizeof(d));

        if (ranlen != decomprlen || memcmp(in,d,ranlen)) {
            printf("Bug! TEST NOT PASSED\n");
            exit(1);
        }
    }
    printf("TEST PASSED :)\n");
    return 0;
}