// _____________________________________________________________________________ bool ContextFileParser::getLine(ContextFileParser::Line& line) { string l; if (std::getline(_in, l)) { size_t i = l.find('\t'); assert(i != string::npos); size_t j = i + 2; assert(j + 3 < l.size()); size_t k = l.find('\t', j + 2); assert(k != string::npos); line._isEntity = (l[i + 1] == '1'); line._word = (line._isEntity ? l.substr(0, i) : ad_utility::getLowercaseUtf8(l.substr(0, i))); line._contextId = static_cast<Id>(atol(l.substr(j + 1, k - j - 1).c_str())); line._score = static_cast<Score>(atol(l.substr(k + 1).c_str())); #ifndef NDEBUG if (_lastCId > line._contextId) { AD_THROW(ad_semsearch::Exception::BAD_INPUT, "ContextFile has to be sorted by context Id."); } _lastCId = line._contextId; #endif return true; } return false; }
// _____________________________________________________________________________ string Server::createQueryFromHttpParams(const ParamValueMap& params) const { string query; // Construct a Query object from the parsed request. auto it = params.find("query"); if (it == params.end() || it->second == "") { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Expected at least one non-empty attribute \"query\"."); } return it->second; }
// _____________________________________________________________________________ void SparqlParser::addFilter(const string& str, ParsedQuery& query) { size_t i = str.find('('); AD_CHECK(i != string::npos); size_t j = str.find(')', i + 1); AD_CHECK(j != string::npos); string filter = str.substr(i + 1, j - i - 1); auto tokens = ad_utility::split(filter, ' '); if (tokens.size() != 3) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Unknown syntax for filter: " + filter); } if (tokens[0].size() == 0 || tokens[0][0] != '?' || tokens[2].size() == 0 || tokens[2][0] != '?') { AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "Filter not supported yet: " + filter); } SparqlFilter f; f._lhs = tokens[0]; f._rhs = tokens[2]; if (tokens[1] == "=" || tokens[1] == "==") { f._type = SparqlFilter::EQ; } else if (tokens[1] == "!=") { f._type = SparqlFilter::NE; } else if (tokens[1] == "<") { f._type = SparqlFilter::LT; } else if (tokens[1] == "<=") { f._type = SparqlFilter::LE; } else if (tokens[1] == "<") { f._type = SparqlFilter::GT; } else if (tokens[1] == ">=") { f._type = SparqlFilter::GE; } else { AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "Filter not supported yet: " + filter); } query._filters.emplace_back(f); }
// _____________________________________________________________________________ Server::ParamValueMap Server::parseHttpRequest( const string& httpRequest) const { LOG(DEBUG) << "Parsing HTTP Request." << endl; ParamValueMap params; _requestProcessingTimer.start(); // Parse the HTTP Request. size_t indexOfGET = httpRequest.find("GET"); size_t indexOfHTTP = httpRequest.find("HTTP"); if (indexOfGET == httpRequest.npos || indexOfHTTP == httpRequest.npos) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Invalid request. Only supporting proper HTTP GET requests!\n" + httpRequest); } string request = httpRequest.substr(indexOfGET + 3, indexOfHTTP - (indexOfGET + 3)); size_t index = request.find("?"); if (index == request.npos) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Invalid request. At least one parameters is " "required for meaningful queries!\n" + httpRequest); } size_t next = request.find('&', index + 1); while (next != request.npos) { size_t posOfEq = request.find('=', index + 1); if (posOfEq == request.npos) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Parameter without \"=\" in HTTP Request.\n" + httpRequest); } string param = ad_utility::getLowercaseUtf8( request.substr(index + 1, posOfEq - (index + 1))); string value = ad_utility::decodeUrl( request.substr(posOfEq + 1, next - (posOfEq + 1))); if (params.count(param) > 0) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Duplicate HTTP parameter: " + param); } params[param] = value; index = next; next = request.find('&', index + 1); } size_t posOfEq = request.find('=', index + 1); if (posOfEq == request.npos) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Parameter without \"=\" in HTTP Request." + httpRequest); } string param = ad_utility::getLowercaseUtf8( request.substr(index + 1, posOfEq - (index + 1))); string value = ad_utility::decodeUrl(request.substr(posOfEq + 1, request.size() - 1 - (posOfEq + 1))); if (params.count(param) > 0) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Duplicate HTTP parameter."); } params[param] = value; LOG(DEBUG) << "Done parsing HTTP Request." << endl; return params; }
// _____________________________________________________________________________ void SparqlParser::addWhereTriple(const string& str, ParsedQuery& query) { size_t i = 0; while (i < str.size() && (str[i] == ' ' || str[i] == '\t' || str[i] == '\n')) { ++i; } if (i == str.size()) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Illegal triple: " + str); } size_t j = i + 1; while (j < str.size() && str[j] != '\t' && str[j] != ' ' && str[j] != '\n') { ++j; } if (j == str.size()) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Illegal triple: " + str); } string s = str.substr(i, j - i); i = j; while (i < str.size() && (str[i] == ' ' || str[i] == '\t' || str[i] == '\n')) { ++i; } if (i == str.size()) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Illegal triple: " + str); } j = i + 1; while (j < str.size() && str[j] != '\t' && str[j] != ' ' && str[j] != '\n') { ++j; } string p = str.substr(i, j - i); if (p == IN_CONTEXT_RELATION || p.find(IN_CONTEXT_RELATION_NS) != string::npos) { string o = ad_utility::strip(str.substr(j), " \t\n"); query._whereClauseTriples.push_back(SparqlTriple(s, p, o)); } else { i = j; while (i < str.size() && (str[i] == ' ' || str[i] == '\t' || str[i] == '\n')) { ++i; } if (i == str.size()) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Illegal triple: " + str); } if (str[i] == '<') { // URI j = str.find('>', i + 1); if (j == string::npos) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Illegal object in : " + str); } ++j; } else { if (str[i] == '\"') { // Literal j = str.find('\"', i + 1); if (j == string::npos) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Illegal literal in : " + str); } ++j; } else { j = i + 1; } while (j < str.size() && str[j] != ' ' && str[j] != '\t' && str[j] != '\n') { ++j; } } string o = str.substr(i, j - i); query._whereClauseTriples.push_back(SparqlTriple(s, p, o)); } }
// _____________________________________________________________________________ void SparqlParser::parseWhere(const string& str, ParsedQuery& query) { size_t i = str.find('{'); size_t j = str.find('}', i); assert(j != string::npos); if (i == string::npos) { throw ParseException("Need curly braces in where clause."); } string inner = ad_utility::strip(str.substr(i + 1, j - i - 1), "\n\t "); // Split where clauses. Cannot simply split at dots, because they may occur // in URIs, stuff with namespaces or literals. vector<string> clauses; vector<string> filters; size_t start = 0; bool insideUri = false; bool insideNsThing = false; bool insideLiteral = false; while (start < inner.size()) { size_t k = start; while (inner[k] == ' ' || inner[k] == '\t' || inner[k] == '\n') { ++k; } if (inner[k] == 'F') { if (inner.substr(k, 6) == "FILTER") { size_t end = inner.find(')', k); if (end == string::npos) { AD_THROW(ad_semsearch::Exception::BAD_QUERY, "Filter without closing paramthesis."); } filters.push_back(inner.substr(k, end - k + 1)); size_t posOfDot = inner.find('.', end); start = (posOfDot == string::npos ? end + 1 : posOfDot + 1); continue; } } while (k < inner.size()) { if (!insideUri && !insideLiteral && !insideNsThing) { if (inner[k] == '.') { clauses.emplace_back(inner.substr(start, k - start)); break; } if (inner[k] == '<') { insideUri = true; } if (inner[k] == '\"') { insideLiteral = true; } if (inner[k] == ':') { insideNsThing = true; } } else { if (insideUri && inner[k] == '>') { insideUri = false; } if (insideLiteral && inner[k] == '\"') { insideUri = false; } if (insideNsThing && (inner[k] == ' ' || inner[k] == '\t')) { insideNsThing = false; } } ++k; } if (k == inner.size()) { clauses.emplace_back(inner.substr(start)); } start = k + 1; } for (const string& clause: clauses) { string c = ad_utility::strip(clause, ' '); if (c.size() > 0) { addWhereTriple(c, query); } } for (const string& filter: filters) { addFilter(filter, query); } }
// _____________________________________________________________________________ void ScanningJoin::computeResult(ResultTable* result) { AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "TODO"); IndexScan::computeResult(result); }
// _____________________________________________________________________________ vector<QueryPlanner::SubtreePlan> QueryPlanner::merge( const vector<QueryPlanner::SubtreePlan>& a, const vector<QueryPlanner::SubtreePlan>& b, const QueryPlanner::TripleGraph& tg) const { // TODO: Add the following features: // If a join is supposed to happen, always check if it happens between // a scan with a relatively large result size // esp. with an entire relation but also with something like is-a Person // If that is the case look at the size estimate for the other side, // if that is rather small, replace the join and scan by a combination. std::unordered_map<string, vector<SubtreePlan>> candidates; // Find all pairs between a and b that are connected by an edge. for (size_t i = 0; i < a.size(); ++i) { for (size_t j = 0; j < b.size(); ++j) { if (connected(a[i], b[j], tg)) { // Find join variable(s) / columns. auto jcs = getJoinColumns(a[i], b[j]); if (jcs.size() != 1) { // TODO: Add joins with secondary join columns. AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "Joins should happen on one variable only, for now. " "No cyclic queries either, currently."); } // Check if a sub-result has to be re-sorted // TODO: replace with HashJoin maybe (or add variant to possible plans). QueryExecutionTree left(_qec); QueryExecutionTree right(_qec); if (a[i]._qet.resultSortedOn() == jcs[0][0]) { left = a[i]._qet; } else { // Create a sort operation. Sort sort(_qec, a[i]._qet, jcs[0][0]); left.setVariableColumns(a[i]._qet.getVariableColumnMap()); left.setOperation(QueryExecutionTree::SORT, &sort); } if (b[j]._qet.resultSortedOn() == jcs[0][1]) { right = b[j]._qet; } else { // Create a sort operation. Sort sort(_qec, b[j]._qet, jcs[0][1]); right.setVariableColumns(b[j]._qet.getVariableColumnMap()); right.setOperation(QueryExecutionTree::SORT, &sort); } // Create the join operation. QueryExecutionTree tree(_qec); Join join(_qec, left, right, jcs[0][0], jcs[0][1]); tree.setVariableColumns(join.getVariableColumns()); tree.setOperation(QueryExecutionTree::JOIN, &join); SubtreePlan plan(_qec); plan._qet = tree; plan._idsOfIncludedFilters = a[i]._idsOfIncludedFilters; plan._idsOfIncludedNodes = a[i]._idsOfIncludedNodes; plan._idsOfIncludedNodes.insert( b[j]._idsOfIncludedNodes.begin(), b[j]._idsOfIncludedNodes.end()); candidates[getPruningKey(plan, jcs[0][0])].emplace_back(plan); } } } // Duplicates are removed if the same triples are touched, // the ordering is the same. Only the best is kept then. // Therefore we mapped plans and use contained triples + ordering var // as key. vector<SubtreePlan> prunedPlans; for (auto it = candidates.begin(); it != candidates.end(); ++it) { size_t minCost = std::numeric_limits<size_t>::max(); size_t minIndex = 0; for (size_t i = 0; i < it->second.size(); ++i) { if (it->second[i].getCostEstimate() < minCost) { minCost = it->second[i].getCostEstimate(); minIndex = i; } } prunedPlans.push_back(it->second[minIndex]); } return prunedPlans; }
// _____________________________________________________________________________ vector<QueryPlanner::SubtreePlan> QueryPlanner::seedWithScans( const QueryPlanner::TripleGraph& tg) const { vector<SubtreePlan> seeds; for (size_t i = 0; i < tg._nodeMap.size(); ++i) { const TripleGraph::Node& node = *tg._nodeMap.find(i)->second; if (node._variables.size() == 0) { AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "Triples should have at least one variable. Not the case in: " + node._triple.asString()); } if (node._variables.size() == 1) { // Just pick one direction, they should be equivalent. SubtreePlan plan(_qec); plan._idsOfIncludedNodes.insert(i); QueryExecutionTree tree(_qec); if (isVariable(node._triple._s)) { IndexScan scan(_qec, IndexScan::ScanType::POS_BOUND_O); scan.setPredicate(node._triple._p); scan.setObject(node._triple._o); scan.precomputeSizeEstimate(); tree.setOperation(QueryExecutionTree::OperationType::SCAN, &scan); tree.setVariableColumn(node._triple._s, 0); } else if (isVariable(node._triple._o)) { IndexScan scan(_qec, IndexScan::ScanType::PSO_BOUND_S); scan.setPredicate(node._triple._p); scan.setSubject(node._triple._s); scan.precomputeSizeEstimate(); tree.setOperation(QueryExecutionTree::OperationType::SCAN, &scan); tree.setVariableColumn(node._triple._o, 0); } else { // Pred variable. AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "No predicate vars yet, please. Triple in question: " + node._triple.asString()); } plan._qet = tree; seeds.push_back(plan); } if (node._variables.size() == 2) { // Add plans for both possible scan directions. if (isVariable(node._triple._p)) { // Pred variable. AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "No predicate vars yet, please. Triple in question: " + node._triple.asString()); } { SubtreePlan plan(_qec); plan._idsOfIncludedNodes.insert(i); QueryExecutionTree tree(_qec); IndexScan scan(_qec, IndexScan::ScanType::PSO_FREE_S); scan.setPredicate(node._triple._p); scan.precomputeSizeEstimate(); tree.setOperation(QueryExecutionTree::OperationType::SCAN, &scan); tree.setVariableColumn(node._triple._s, 0); tree.setVariableColumn(node._triple._o, 1); plan._qet = tree; seeds.push_back(plan); } { SubtreePlan plan(_qec); plan._idsOfIncludedNodes.insert(i); QueryExecutionTree tree(_qec); IndexScan scan(_qec, IndexScan::ScanType::POS_FREE_O); scan.setPredicate(node._triple._p); scan.precomputeSizeEstimate(); tree.setOperation(QueryExecutionTree::OperationType::SCAN, &scan); tree.setVariableColumn(node._triple._o, 0); tree.setVariableColumn(node._triple._s, 1); plan._qet = tree; seeds.push_back(plan); } } if (node._variables.size() >= 3) { AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "Triples should have at most two variables. Not the case in: " + node._triple.asString()); } } return seeds; }
// _____________________________________________________________________________ QueryExecutionTree QueryPlanner::createExecutionTree( const ParsedQuery& pq) const { LOG(DEBUG) << "Creating execution plan.\n"; // Strategy: // Create a graph. // Each triple corresponds to a node, there is an edge between two nodes iff // they share a variable. TripleGraph tg = createTripleGraph(pq); // Each node/triple corresponds to a scan (more than one way possible), // each edge corresponds to a possible join. // Enumerate and judge possible query plans using a DP table. // Each ExecutionTree for a sub-problem gives an estimate. // Start bottom up, i.e. with the scans for triples. // Always merge two solutions from the table by picking one possible join. // A join is possible, if there is an edge between the results. // Therefore we keep track of all edges that touch a sub-result. // When joining two sub-results, the results edges are those that belong // to exactly one of the two input sub-trees. // If two of them have the same target, only one out edge is created. // All edges that are shared by both subtrees, are checked if they are covered // by the join or if an extra filter/select is needed. // The algorithm then creates all possible plans for 1 to n triples. // To generate a plan for k triples, all subsets between i and k-i are // joined. // Filters are now added to the mix when building execution plans. // Without them, a plan has an execution tree and a set of // covered triple nodes. // With them, it also has a set of covered filters. // A filter can be applied as soon as all variables that occur in the filter // Are covered by the query. This is also always the place where this is done. // TODO: resolve cyclic queries and turn them into filters. // Copy made so that something can be added for cyclic queries. // tg.turnCyclesIntoFilters(filters); // TODO: resolve cycles involving a text operation. // Split the graph at possible text operations. vector<pair<TripleGraph, vector<SparqlFilter>>> graphs; unordered_map<string, vector<size_t>> contextVarTotextNodes; vector<SparqlFilter> filtersWithContextVars; tg.splitAtText(pq._filters, graphs, contextVarTotextNodes, filtersWithContextVars); vector<vector<SubtreePlan>> finalTab; if (graphs.size() == 1) { finalTab = fillDpTab(graphs[0].first, graphs[0].second); } else { AD_THROW(ad_semsearch::Exception::NOT_YET_IMPLEMENTED, "No text yet."); } // If there is an order by clause, add another row to the table and // just add an order by / sort to every previous result if needed. // If the ordering is perfect already, just copy the plan. if (pq._orderBy.size() > 0) { finalTab.emplace_back(getOrderByRow(pq, finalTab)); } vector<SubtreePlan>& lastRow = finalTab.back(); AD_CHECK_GT(lastRow.size(), 0); size_t minCost = lastRow[0].getCostEstimate(); size_t minInd = 0; for (size_t i = 1; i < lastRow.size(); ++i) { if (lastRow[i].getCostEstimate() < minCost) { minCost = lastRow[i].getCostEstimate(); minInd = i; } } // A distinct modifier is applied in the end. This is very easy // but not necessarily optimal. // TODO: Adjust so that the optimal place for the operation is found. if (pq._distinct) { QueryExecutionTree distinctTree(lastRow[minInd]._qet); vector<size_t> keepIndices; for (const auto& var : pq._selectedVariables) { if (lastRow[minInd]._qet.getVariableColumnMap().find(var) != lastRow[minInd]._qet.getVariableColumnMap().end()) { keepIndices.push_back( lastRow[minInd]._qet.getVariableColumnMap().find( var)->second); } } Distinct distinct(_qec, lastRow[minInd]._qet, keepIndices); distinctTree.setOperation(QueryExecutionTree::DISTINCT, &distinct); return distinctTree; } LOG(DEBUG) << "Done creating execution plan.\n"; return lastRow[minInd]._qet; }