/*
 * Set up the hashing layer for the requested backend hash function.
 *
 * FS_HASH_UMAC is the only value that lets initialisation continue; MD5,
 * CRC64 and the "unknown" marker log a critical error and terminate the
 * process with exit status 4.
 *
 * On success this creates the string-keyed bnids table, registers
 * fs_hash_fini() with atexit(), and fills the fs_c table with the RIDs of
 * commonly used datatypes, language tags and system URIs.
 */
void fs_hash_init(fsp_hash_enum type)
{
    switch (type) {
    case FS_HASH_UMAC:
        /* supported, carry on with initialisation */
        break;

    case FS_HASH_UNKNOWN:
        fs_error(LOG_CRIT, "Unknown backend hash function, exiting");
        exit(4);
        break;

    case FS_HASH_MD5:
    case FS_HASH_CRC64:
        fs_error(LOG_CRIT, "Unsuported backend hash function, exiting");
        exit(4);
        break;
    }

    bnids = g_hash_table_new_full(g_str_hash, g_str_equal, bnhash_destroy, NULL);
    atexit(fs_hash_fini);

    /* the empty attribute */
    fs_c.empty = 0LL;

    /* core XSD datatypes */
    fs_c.xsd_string   = fs_hash_uri(XSD_STRING);
    fs_c.xsd_integer  = fs_hash_uri(XSD_INTEGER);
    fs_c.xsd_float    = fs_hash_uri(XSD_FLOAT);
    fs_c.xsd_double   = fs_hash_uri(XSD_DOUBLE);
    fs_c.xsd_decimal  = fs_hash_uri(XSD_DECIMAL);
    fs_c.xsd_boolean  = fs_hash_uri(XSD_BOOLEAN);
    fs_c.xsd_datetime = fs_hash_uri(XSD_DATETIME);
    fs_c.xsd_date     = fs_hash_uri(XSD_DATE);

    /* integer types derived in the XSD namespace */
    fs_c.xsd_pinteger  = fs_hash_uri(XSD_NAMESPACE "positiveInteger");
    fs_c.xsd_ninteger  = fs_hash_uri(XSD_NAMESPACE "negativeInteger");
    fs_c.xsd_npinteger = fs_hash_uri(XSD_NAMESPACE "nonPositiveInteger");
    fs_c.xsd_nninteger = fs_hash_uri(XSD_NAMESPACE "nonNegativeInteger");
    fs_c.xsd_long      = fs_hash_uri(XSD_NAMESPACE "long");
    fs_c.xsd_int       = fs_hash_uri(XSD_NAMESPACE "int");
    fs_c.xsd_short     = fs_hash_uri(XSD_NAMESPACE "short");
    fs_c.xsd_byte      = fs_hash_uri(XSD_NAMESPACE "byte");
    fs_c.xsd_ulong     = fs_hash_uri(XSD_NAMESPACE "unsignedLong");
    fs_c.xsd_uint      = fs_hash_uri(XSD_NAMESPACE "unsignedInt");
    fs_c.xsd_ushort    = fs_hash_uri(XSD_NAMESPACE "unsignedShort");
    fs_c.xsd_ubyte     = fs_hash_uri(XSD_NAMESPACE "unsignedByte");

    /* language tags, hashed as literals with an empty attribute */
    fs_c.lang_en = fs_hash_literal("en", 0);
    fs_c.lang_fr = fs_hash_literal("fr", 0);
    fs_c.lang_de = fs_hash_literal("de", 0);
    fs_c.lang_es = fs_hash_literal("es", 0);

    /* RDF / RDFS vocabulary and store-specific URIs */
    fs_c.rdf_type      = fs_hash_uri(RDF_NAMESPACE "type");
    fs_c.default_graph = fs_hash_uri(FS_DEFAULT_GRAPH);
    fs_c.system_config = fs_hash_uri(FS_SYSTEM_CONFIG);
    fs_c.rdfs_label    = fs_hash_uri(RDFS_LABEL);

    /* text index predicates */
    fs_c.fs_text_index = fs_hash_uri(FS_TEXT_INDEX);
    fs_c.fs_token      = fs_hash_uri(FS_TEXT_TOKEN);
    fs_c.fs_dmetaphone = fs_hash_uri(FS_TEXT_DMETAPHONE);
    fs_c.fs_stem       = fs_hash_uri(FS_TEXT_STEM);

    /* access control; note the default admin is hashed as a literal */
    fs_c.fs_acl_admin         = fs_hash_uri(FS_ACL_ADMIN);
    fs_c.fs_acl_access_by     = fs_hash_uri(FS_ACL_ONLY_ACCESS_BY);
    fs_c.fs_acl_default_admin = fs_hash_literal(FS_ACL_DEFAULT_ADMIN, 0);
}
/*
 * Hash and record the plain literal currently held in the parse context.
 *
 * ctxt->resource is the lexical form (NULL is treated as the empty string)
 * and ctxt->attr, when set, is hashed as the literal's language attribute
 * and recorded as a resource of its own before the literal itself.
 *
 * Returns the RID of the literal.
 */
static fs_rid insert_plain(xmlctxt *ctxt)
{
    char *lexical = ctxt->resource;
    if (!lexical) {
        /* no character data means the empty string */
        lexical = "";
    }

    fs_rid lang_rid = fs_c.empty;
    if (ctxt->attr) {
        lang_rid = fs_hash_literal(ctxt->attr, fs_c.empty);
        insert_resource(ctxt, lang_rid, fs_c.empty, ctxt->attr);
    }

    fs_rid literal_rid = fs_hash_literal(lexical, lang_rid);
    insert_resource(ctxt, literal_rid, lang_rid, lexical);

    return literal_rid;
}
/*
 * Build a plain-literal value with lexical form s and language tag l.
 *
 * A NULL or empty tag yields the empty attribute (fs_c.empty); any other
 * tag is hashed with fs_hash_literal(l, 0). The value keeps a pointer to s,
 * it does not copy the string.
 */
fs_value fs_value_plain_with_lang(const char *s, const char *l)
{
    fs_value result = fs_value_blank();
    const int has_lang = (l && *l != '\0');

    result.attr = has_lang ? fs_hash_literal(l, 0) : fs_c.empty;
    result.lex = (char *)s;

    return result;
}
/*
 * Command line tool: print the RID of a single RDF term.
 *
 * The only argument is either <uri>, "literal", "literal"@lang or
 * "literal"^^datatype. The RID is written to stdout as 16 upper-case hex
 * digits. Exits with status 1 on a usage error or when the argument cannot
 * be recognised.
 */
int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "%s revision %s\n", argv[0], FS_BACKEND_VER);
        fprintf(stderr, "Usage: %s <uri> | \"literal\"\n", argv[0]);
        exit(1);
    }

    char *term = argv[1];
    char lex[128], lang[128], type[128], uri[128];
    fs_rid rid;

    /* initialise with whichever hash function the build selected */
#ifdef FS_MD5
    fs_hash_init(FS_HASH_MD5);
#endif
#ifdef FS_CRC64
    fs_hash_init(FS_HASH_CRC64);
#endif
#ifdef FS_UMAC
    fs_hash_init(FS_HASH_UMAC);
#endif

    /* most specific patterns first: language-tagged, typed, plain, URI */
    if (sscanf(term, "\"%127[^\"]\"@%127s", lex, lang) == 2) {
        fs_rid lang_rid = fs_hash_literal(lang, 0);
        rid = fs_hash_literal(lex, lang_rid);
    } else if (sscanf(term, "\"%127[^\"]\"^^%127s", lex, type) == 2) {
        fs_rid type_rid = fs_hash_uri(type);
        rid = fs_hash_literal(lex, type_rid);
    } else if (sscanf(term, "\"%127[^\"]\"", lex) == 1) {
        rid = fs_hash_literal(lex, 0);
    } else if (sscanf(term, "<%127[^>]>", uri) == 1) {
        rid = fs_hash_uri(uri);
    } else {
        fprintf(stderr, "Couldn't recognise a URI or literal in string '%s'\n", term);
        exit(1);
    }

    printf("%016llX\n", rid);

    return 0;
}
/*
 * Hash and record the typed literal currently held in the parse context.
 *
 * ctxt->resource is the lexical form and ctxt->attr the datatype URI. The
 * datatype, when present, is hashed as a URI and recorded as a resource of
 * its own; when it is missing an error is logged and the literal is hashed
 * with the empty attribute.
 *
 * A NULL lexical form is treated as the empty string, the same way
 * insert_plain() handles it, so a NULL pointer is never passed on to
 * fs_hash_literal() or insert_resource().
 *
 * Returns the RID of the literal.
 */
static fs_rid insert_typed(xmlctxt *ctxt)
{
    char *text = ctxt->resource;
    if (!text) {
        /* no character data: this is the empty string, not a missing value */
        text = "";
    }

    fs_rid dt = fs_c.empty;
    if (ctxt->attr) {
        dt = fs_hash_uri(ctxt->attr);
        insert_resource(ctxt, dt, fs_c.empty, ctxt->attr);
    } else {
        fs_error(LOG_ERR, "NULL type URI inserted");
    }

    fs_rid r = fs_hash_literal(text, dt);
    insert_resource(ctxt, r, dt, text);

    return r;
}
/*
 * Return a copy of a whose rid field is populated.
 *
 * If the RID is already marked valid the value is returned untouched.
 * Otherwise the lexical form is filled in and the RID is computed by
 * hashing a.lex with a.attr, after which FS_V_RID is marked valid.
 *
 * The value is passed and returned by value, so the result of
 * fs_value_fill_lexical() has to be assigned back; previously it was
 * discarded and the hash was computed from the unfilled a.lex.
 * NOTE(review): this assumes fs_value_fill_lexical() returns the filled
 * value in the same way this function does - confirm against its prototype.
 */
fs_value fs_value_fill_rid(fs_query *q, fs_value a)
{
    if (a.valid & fs_valid_bit(FS_V_RID)) {
        return a;
    }

    if (a.valid & fs_valid_bit(FS_V_TYPE_ERROR)) {
        /* NOTE(review): this NULL RID is overwritten by the hash below;
         * check whether an early return was intended for error values */
        a.rid = FS_RID_NULL;
    }

    a = fs_value_fill_lexical(q, a);
    a.rid = fs_hash_literal(a.lex, a.attr);
    a.valid |= fs_valid_bit(FS_V_RID);

    return a;
}
/*
 * Cast value v to the datatype identified by d.rid.
 *
 * Two combinations are accepted:
 *  - a literal v with a URI datatype d is handed to fn_cast_intl();
 *  - a URI v cast to xsd:string goes through fn_cast_intl() and then has
 *    its RID replaced by the hash of v.lex typed as d.rid.
 * Anything else yields an FS_ERROR_INVALID_TYPE error value.
 */
fs_value fn_cast(fs_query *q, fs_value v, fs_value d)
{
#if 0
    printf("CAST ");
    fs_value_print(v);
    printf(" -> ");
    fs_value_print(d);
    printf("\n");
#endif

    /* literal -> any datatype URI */
    if (FS_IS_URI(d.rid) && FS_IS_LITERAL(v.rid)) {
        return fn_cast_intl(q, v, d.rid);
    }

    /* the only remaining legal cast is URI -> xsd:string */
    if (!(d.rid == fs_c.xsd_string && FS_IS_URI(v.rid))) {
        return fs_value_error(FS_ERROR_INVALID_TYPE, "cast on URI/bNode");
    }

    fs_value as_string = fn_cast_intl(q, v, d.rid);
    as_string.rid = fs_hash_literal(v.lex, d.rid);

    return as_string;
}
/*
 * Fill res->lex and res->attr from a rasqal literal.
 *
 *  - NULL literal:  lex "(null)", attr FS_RID_NULL
 *  - variable:      lex NULL, attr FS_RID_GONE
 *  - URI:           lex is the URI string, attr FS_RID_NULL
 *  - anything else: lex is l->string and attr is the RID of the datatype
 *    URI or of the language tag (0 when the literal has neither). The
 *    attribute itself is imported as a resource through uctxt->link.
 *
 * Language tags are upper-cased before hashing, matching
 * fs_hash_rasqal_literal(), so that the attribute RID stored here is the
 * same one the literal's own RID is computed with. The upper-cased copy is
 * temporary and released once the attribute resource has been imported.
 *
 * res->lex points into storage owned by the literal; row is not used.
 */
void fs_resource_from_rasqal_literal(struct update_context *uctxt, rasqal_literal *l, fs_resource *res, int row)
{
    if (!l) {
        res->lex = "(null)";
        res->attr = FS_RID_NULL;

        return;
    }

    rasqal_literal_type type = l->type;

    if (type == RASQAL_LITERAL_VARIABLE) {
        /* right now you can't introduce new literals in INSERT, so it
         * doesn't matter */
        res->lex = NULL;
        res->attr = FS_RID_GONE;

        return;
    }

    if (type == RASQAL_LITERAL_URI) {
        res->lex = (char *)raptor_uri_as_string(l->value.uri);
        res->attr = FS_RID_NULL;

        return;
    }

    res->lex = (char *)l->string;
    res->attr = 0;

    fs_resource ares;
    ares.lex = NULL;
    char *upper_lang = NULL;

    if (l->datatype) {
        res->attr = fs_hash_uri((char *)raptor_uri_as_string(l->datatype));
        ares.rid = res->attr;
        ares.lex = (char *)raptor_uri_as_string(l->datatype);
        ares.attr = FS_RID_NULL;
    } else if (l->language) {
        /* lang tags are normalised to upper case internally */
        upper_lang = g_ascii_strup((char *)l->language, -1);
        res->attr = fs_hash_literal(upper_lang, 0);
        ares.rid = res->attr;
        ares.lex = upper_lang;
        ares.attr = 0;
    }

    /* insert attribute resource if there is one */
    if (ares.lex) {
        fsp_res_import(uctxt->link, FS_RID_SEGMENT(ares.rid, uctxt->segments), 1, &ares);
    }

    /* g_free() accepts NULL, so no guard is needed */
    g_free(upper_lang);
}
/*
 * Compute the RID for a rasqal literal.
 *
 *  - NULL literal: FS_RID_NULL.
 *  - Variable: looked up in the first binding table of uc->q for the given
 *    row; FS_RID_NULL (with an error logged) when there is no query.
 *  - URI: hash of the URI string.
 *  - String-like literal: hash of the lexical form together with the RID of
 *    its datatype URI or of its upper-cased language tag (0 if neither).
 *  - Blank node: resolved through fs_bnode_id() on uc->link.
 *  - Any other RDF term type: error logged, FS_RID_NULL returned.
 */
fs_rid fs_hash_rasqal_literal(struct update_context *uc, rasqal_literal *l, int row)
{
    if (!l) {
        return FS_RID_NULL;
    }

    if (l->type == RASQAL_LITERAL_VARIABLE) {
        if (!uc->q) {
            fs_error(LOG_ERR, "no variables bound");

            return FS_RID_NULL;
        }

        return fs_binding_get_val(uc->q->bb[0], l->value.variable, row, NULL);
    }

    rasqal_literal_type type = rasqal_literal_get_rdf_term_type(l);

    switch (type) {
    case RASQAL_LITERAL_URI:
        return fs_hash_uri((char *)raptor_uri_as_string(l->value.uri));

    case RASQAL_LITERAL_UNKNOWN:
    case RASQAL_LITERAL_STRING:
    case RASQAL_LITERAL_XSD_STRING: {
        fs_rid attr_rid = 0;

        if (l->datatype) {
            attr_rid = fs_hash_uri((char *)raptor_uri_as_string(l->datatype));
        } else if (l->language) {
            /* lang tags are normalised to upper case internally */
            char *upper_lang = g_ascii_strup((char *)l->language, -1);
            attr_rid = fs_hash_literal(upper_lang, 0);
            g_free(upper_lang);
        }

        return fs_hash_literal((char *)rasqal_literal_as_string(l), attr_rid);
    }

    case RASQAL_LITERAL_BLANK: {
        raptor_term_blank_value blank;
        blank.string = (unsigned char *)rasqal_literal_as_string(l);
        blank.string_len = strlen((char *)blank.string);

        return fs_bnode_id(uc->link, blank);
    }

    /* none of these are handled here; fall out to the error below */
    case RASQAL_LITERAL_VARIABLE:
    case RASQAL_LITERAL_QNAME:
    case RASQAL_LITERAL_PATTERN:
    case RASQAL_LITERAL_BOOLEAN:
    case RASQAL_LITERAL_INTEGER:
    case RASQAL_LITERAL_INTEGER_SUBTYPE:
    case RASQAL_LITERAL_DECIMAL:
    case RASQAL_LITERAL_FLOAT:
    case RASQAL_LITERAL_DOUBLE:
    case RASQAL_LITERAL_DATETIME:
    case RASQAL_LITERAL_UDT:
#if RASQAL_VERSION >= 929
    case RASQAL_LITERAL_DATE:
#endif
        break;
    }

    fs_error(LOG_ERR, "bad rasqal literal (type %d)", type);

    return FS_RID_NULL;
}
static void rdf_parser_statement_handler(void* user_data, const raptor_statement* st) { raptor_term *g, *s, *p, *o; g = st->graph; s = st->subject; p = st->predicate; o = st->object; rdf_parser_internal *parser_obj = (rdf_parser_internal *) user_data; if (parser_obj->counter == 0) parser_obj->partial_parse_time = g_timer_new(); parser_obj->counter++; /* init index logic */ unsigned char *gc = NULL; fs_rid g_rid; if (parser_obj->trig) { gc = raptor_uri_as_string(g->value.uri); g_rid = fs_hash_uri((const char *)gc); } else { g_rid = parser_obj->g_rid ; gc = parser_obj->model; } unsigned char *sc = NULL; if (s->type == RAPTOR_TERM_TYPE_URI) sc = raptor_uri_as_string(s->value.uri); else { sc = (unsigned char *) g_strdup_printf("bnode:b%s%s",s->value.blank.string+5,parser_obj->bnode_ts); } unsigned char *pc = raptor_uri_as_string(p->value.uri); unsigned char *oc = NULL; unsigned char *o_lang = NULL; unsigned char *o_datatype = NULL; fs_rid s_rid = fs_hash_uri((const char *) sc); fs_rid p_rid = fs_hash_uri((const char *) pc); fs_rid o_rid = 0x0; if (o->type == RAPTOR_TERM_TYPE_URI) { oc = raptor_uri_as_string(o->value.uri); o_rid = fs_hash_uri((const char *) oc); } else if (o->type == RAPTOR_TERM_TYPE_LITERAL) { oc = o->value.literal.string; if (o->value.literal.datatype) { o_datatype = raptor_uri_as_string(o->value.literal.datatype); o_rid = fs_hash_literal((const char *) oc,fs_hash_uri((const char *) o_datatype)); } else if (o->value.literal.language != NULL) { o_lang = o->value.literal.language; o_rid = fs_hash_literal((const char *) oc, fs_hash_uri((const char *) o_lang)); } else { o_rid = fs_hash_literal((const char *) oc, FS_RID_NULL); } oc = raptor_term_to_string(o); } else if (o->type == RAPTOR_TERM_TYPE_BLANK) { oc = (unsigned char *) g_strdup_printf("bnode:b%s%s",o->value.blank.string+5,parser_obj->bnode_ts); o_rid = fs_hash_uri((const char *) oc); } int seg_id = s_rid % SEGMENTS; fs_rid *quad = rdf_parser_new_quad(g_rid,s_rid,p_rid,o_rid); 
g_ptr_array_add(parser_obj->quads[seg_id],quad); /* saves hashes into disk hash TODO looks for optimistions */ rdf_kb *kb = parser_obj->kb; fs_rid hashes[4] = {g_rid,s_rid,p_rid,o_rid}; //log_debug("%llx %llx %llx %llx", hashes[0], hashes[1], hashes[2], hashes[3]); //char tmp_rid[16+1]; char tmp_rid[17]; unsigned char *strings[4] = {gc,sc,pc,oc}; int assigned_hash=0; for(int i=0;i<4;i++) { memset(tmp_rid,0,16); sprintf(tmp_rid,"%llx",hashes[i]); assigned_hash = hashes[i] % HASHES_NUM; //if (i > 0 && strlen(tmp_rid) < 15 ) //printf("ERRRRO NUL %s %s\n", tmp_rid, strings[i]); kcdbset(kb->hash_stores[assigned_hash],(const char *) tmp_rid, 16 ,(const char *) strings[i], strlen((const char *) strings[i])); } /* end of saving hashes into disk */ if (!(parser_obj->counter % STAT_BATCH)) { double kt = parser_obj->counter/1e3; log_debug("parsing progress %.0lf kT %.2lf kT/s %.2lf kT/s",kt,kt/g_timer_elapsed(parser_obj->global_parse_time,NULL), (STAT_BATCH/1e3)/g_timer_elapsed(parser_obj->partial_parse_time,NULL)); g_timer_start(parser_obj->partial_parse_time); } }
static void store_stmt(void *user_data, const raptor_statement * statement) { fs_parse_stuff *data = (fs_parse_stuff *) user_data; char *subj = (char *) raptor_uri_as_string((raptor_uri *) statement->subject); char *pred; char *obj; fs_rid m, s, p, o; char tmpp[512]; m = data->model_hash; if (statement->subject_type == RAPTOR_IDENTIFIER_TYPE_ANONYMOUS) { s = fs_bnode_id(data->link, statement->subject); subj = (char *) statement->subject; } else { s = fs_hash_uri(subj); } if (statement->predicate_type == RAPTOR_IDENTIFIER_TYPE_ORDINAL) { sprintf(tmpp, MEMBER_PREFIX "%d", *((int *)statement->predicate)); pred = tmpp; } else { pred = (char *) raptor_uri_as_string((raptor_uri *) statement->predicate); } p = fs_hash_uri(pred); fs_rid attr = fs_c.empty; if (statement->object_type == RAPTOR_IDENTIFIER_TYPE_LITERAL || statement->object_type == RAPTOR_IDENTIFIER_TYPE_XML_LITERAL) { obj = (char *) statement->object; if (statement->object_literal_language) { char *langtag = (char *)statement->object_literal_language; for (char *pos = langtag; *pos; pos++) { if (islower(*pos)) { *pos = toupper(*pos); } } attr = fs_hash_literal(langtag, 0); buffer_res(data->link, data->segments, attr, langtag, fs_c.empty, data->dryrun); } else if (raptor_uri_as_string(statement->object_literal_datatype)) { char *dt = (char *)raptor_uri_as_string(statement->object_literal_datatype); attr = fs_hash_uri(dt); buffer_res(data->link, data->segments, attr, dt, FS_RID_NULL, data->dryrun); } o = fs_hash_literal(obj, attr); } else if (statement->object_type == RAPTOR_IDENTIFIER_TYPE_ANONYMOUS) { o = fs_bnode_id(data->link, statement->object); obj = (char *) statement->object; } else { obj = (char *) raptor_uri_as_string((raptor_uri *) statement-> object); attr = FS_RID_NULL; o = fs_hash_uri(obj); } buffer_res(data->link, data->segments, s, subj, FS_RID_NULL, data->dryrun); buffer_res(data->link, data->segments, p, pred, FS_RID_NULL, data->dryrun); buffer_res(data->link, data->segments, o, obj, attr, 
data->dryrun); fs_rid tbuf[4] = { m, s, p, o }; retry_write: if (write(data->quad_fd, tbuf, sizeof(tbuf)) == -1) { fs_error(LOG_ERR, "failed to buffer quad to fd %d (0x%x): %s", data->quad_fd, data->quad_fd, strerror(errno)); if (errno == EAGAIN || errno == EINTR || errno == ENOSPC) { sleep(5); goto retry_write; } } if (data->verbosity > 2) { fprintf(stderr, "%016llx %016llx %016llx %016llx\n", m, s, p, o); } data->count_trip++; total_triples_parsed++; if (data->verbosity && total_triples_parsed % 10000 == 0) { printf("Pass 1, processed %d triples\r", total_triples_parsed); fflush(stdout); } if (total_triples_parsed == FS_CHUNK_SIZE) { if (data->verbosity) printf("Pass 1, processed %d triples (%d)\n", FS_CHUNK_SIZE, data->count_trip); *(data->ext_count) += process_quads(data); data->last_count = data->count_trip; total_triples_parsed = 0; gettimeofday(&then_last, 0); } }