/**
 * Loads the pattern list from a comma-separated file and initialises the
 * per-pattern bookkeeping tables.
 *
 * Every usable line must hold at least two fields: the pattern text (lowered
 * with tolower) and an occurrence count. For each pattern this function
 *   - stores the count in both `patterns` and `prob`,
 *   - zeroes the quote / parenthesis / dash / capital / total counters,
 *   - sizes `f` and `sumOutside` to one zero-filled slot per space-separated token,
 *   - adds the space-padded pattern (" pattern ") to `tree`.
 * Once the file has been consumed, `tree.make()` is called a single time.
 *
 * Lines with fewer than two fields are skipped. If the file cannot be opened
 * nothing is loaded and the function returns without touching `tree`.
 *
 * @param filename path of the pattern file.
 */
void loadPattern(const string &filename)
{
	FILE* in = tryOpen(filename, "r");
	if (in == NULL) {
		// No stream to read from; the sibling loader loadProb() treats a NULL
		// result from tryOpen() the same way.
		return;
	}
	for (;getLine(in);) {
		vector<string> tokens = splitBy(line, ',');
		if (tokens.size() < 2) {
			// Malformed line (no count column): skip rather than index past the end.
			continue;
		}
		string pattern = tolower(tokens[0]);
		int occurrence = 0; // stays 0 if fromString() leaves it untouched
		fromString(tokens[1], occurrence);
		
		patterns[pattern] = occurrence;
		prob[pattern] = occurrence;
		
		quote[pattern] = parenthesis[pattern] = dash[pattern] = capital[pattern] = total[pattern] = 0;
		
		// One accumulator slot per word of the pattern.
		size_t tokensN = splitBy(pattern, ' ').size();
		f[pattern].resize(tokensN, 0);
		sumOutside[pattern].resize(tokensN, 0);
		
		// Padded with spaces on both sides, matching how main() pads the text it searches.
		tree.add(" " + pattern + " ");
	}
	fclose(in);
	//cerr << "# Pattern = " << prob.size() << endl;
	
	tree.make();
	//cerr << "Tree is built" << endl;
}
/**
 * Parses a tokenised SELECT statement, builds the query plan and prints the
 * resulting relation.
 *
 * Expected token layout (words[0] is skipped; in the call made by Insert() it
 * is the "SELECT" keyword):
 *   SELECT [DISTINCT] col[, col...] FROM tbl[, tbl...] [WHERE ...] [ORDER BY col]
 *
 * Column and table tokens are passed through splitBy(token, ",") and the first
 * piece is kept, which drops a trailing comma. WHERE tokens are kept verbatim.
 * Only the single token following "ORDER BY" is used as the sort column.
 *
 * Truncated statements no longer read past the end of `words`: a missing
 * token after SELECT or after ORDER BY simply leaves the corresponding list
 * empty, and tokens that split into nothing are ignored.
 *
 * @param words          statement tokens.
 * @param schema_manager passed through to preProcess() and generateLQP().
 * @param mem            passed through to generateLQP().
 * @return the relation produced by generateLQP(); it is printed to stdout
 *         when non-null.
 */
Relation* Select(vector<string> &words, SchemaManager &schema_manager, MainMemory &mem){
	vector<string> select_list, from_list, where_list, order_list;
	bool has_distinct = false;
	size_t i = 1;
	if (i < words.size() && words[i] == "DISTINCT"){
		has_distinct = true;
		i++;
	}
	while (i < words.size() && words[i] != "FROM"){
		// drop comma
		vector<string> pieces = splitBy(words[i], ",");
		if (!pieces.empty()){
			select_list.push_back(pieces[0]);
		}
		i++;
	}
	i++; // skip FROM
	while ( i < words.size() && words[i] != "WHERE" && words[i] != "ORDER"){
		vector<string> pieces = splitBy(words[i], ",");
		if (!pieces.empty()){
			from_list.push_back(pieces[0]);
		}
		i++;
	}
	if (i < words.size()){
		if (words[i] == "WHERE"){
			i++; // skip WHERE
			while (i < words.size() && words[i] != "ORDER"){
				where_list.push_back(words[i]);
				i++;
			}
		}
		if (i < words.size() && words[i] == "ORDER"){
			i = i + 2; // skip ORDER BY
			// "ORDER BY" may be the last thing in a truncated statement.
			if (i < words.size()){
				order_list.push_back(words[i]);
				i++;
			}
		}
	}

	// add table name to each column name
	preProcess(from_list, select_list, schema_manager);
	preProcess(from_list, where_list, schema_manager);
	preProcess(from_list, order_list, schema_manager);
	/*
	   print(select_list);
	   print(from_list);
	   print(where_list);
	   print(order_list);
	   */
	Relation* view =  generateLQP(has_distinct, select_list, from_list, where_list, order_list, schema_manager, mem);

	// Insert() asserts on the pointer it gets back, so a null plan result is
	// handed to the caller instead of being dereferenced here.
	if (view){
		cout<<*view<<endl;
	}
	return view;
}
/**
 * Scores the best way of cutting `pattern` into at least two known phrases.
 *
 * The pattern is split on spaces and a left-to-right dynamic programme is run
 * over the word boundaries. A run of consecutive words may be used as one
 * segment only if it is a key of `prob`; the run that spans the whole pattern
 * is never used. Each segment multiplies the running score by its `prob`
 * value and by the global `penalty` (the last segment included).
 *
 * @param pattern space-separated phrase to segment.
 * @param prob    phrase -> score table.
 * @return the highest product found for a full segmentation, 0 when none
 *         exists, or 1 when splitBy() yields no words.
 */
double segmentationByDP(const string &pattern, MAP_S_D &prob)
{
    const vector<string> words = splitBy(pattern, ' ');
    const size_t wordCount = words.size();

    // best[k] holds the top score for segmenting the first k words.
    vector<double> best(wordCount + 1, 0);
    best[0] = 1;

    for (size_t begin = 0; begin < wordCount; ++begin) {
        string candidate;
        for (size_t end = begin; end < wordCount; ++end) {
            if (!candidate.empty()) {
                candidate += " ";
            }
            candidate += words[end];

            // A segment covering every word is the pattern itself, not a split.
            const bool coversEverything = (end - begin + 1 == wordCount);
            if (coversEverything || !prob.count(candidate)) {
                continue;
            }

            const double score = best[begin] * prob[candidate] * penalty;
            if (score > best[end + 1]) {
                best[end + 1] = score;
            }
        }
    }
    return best[wordCount];
}
/**
 * Builds the stop-word feature rows for every multi-word pattern.
 *
 * The first returned string is the CSV header
 * "avg_idf,stop_ratio,first_stop,last_stop". One row follows for each key of
 * `patterns` that contains a space, in the iteration order of `patterns`:
 *   avg_idf    - mean of word2idf over the tokens that have an entry (0 if none),
 *   stop_ratio - share of tokens found in `stopwords`,
 *   first_stop - 1 if the first token is in `stopwords`, else 0,
 *   last_stop  - 1 if the last token is in `stopwords`, else 0.
 *
 * A pattern whose split yields no tokens still produces a row (all zeros), so
 * the number of rows stays one per multi-word pattern.
 *
 * @param patterns pattern table; only the keys are read.
 * @return header line followed by one formatted row per multi-word pattern.
 */
vector<string> printStopwords(MAP_S_D &patterns)
{
    vector<string> ret(1, "avg_idf,stop_ratio,first_stop,last_stop");
    char temp[1000];
    FOR (iter, patterns) {
        const string &pattern = iter->first;
        if (pattern.find(' ') == string::npos) {
            continue;
        }
        vector<string> tokens = splitBy(pattern, ' ');
        double sum = 0;
        int cnt = 0, stopCnt = 0;
        FOR (token, tokens) {
            if (word2idf.count(*token)) {
                ++ cnt;
                sum += word2idf[*token];
            }
            if (stopwords.count(*token)) {
                ++ stopCnt;
            }
        }
        if (cnt > 0) {
            sum /= cnt;
        }

        // Guard the empty case: no front()/back() to inspect and no divisor.
        double stopRatio = 0;
        int firstStop = 0, lastStop = 0;
        if (!tokens.empty()) {
            stopRatio = (double)stopCnt / tokens.size();
            // count() returns an unsigned size type; convert explicitly so the
            // arguments really are the ints that "%d" expects.
            firstStop = static_cast<int>(stopwords.count(tokens.front()));
            lastStop = static_cast<int>(stopwords.count(tokens.back()));
        }

        snprintf(temp, sizeof(temp), "%.10f,%.10f,%d,%d", sum, stopRatio, firstStop, lastStop);
        ret.push_back(temp);
    }
    return ret;
}
/**
 * Loads phrase scores from the files "<prefix>1.csv" ... "<prefix>6.csv".
 *
 * For each file that opens, the first line is read and discarded, then every
 * following line is split on ','. Field 0 (lowered with tolower) is the phrase
 * and field 2 is parsed as a double and stored in `prob`. Lines with fewer
 * than three fields are skipped. A file that cannot be opened produces a
 * warning on stderr and loading continues with the next length.
 *
 * @param prefix path prefix placed in front of "<length>.csv".
 */
void loadProb(const string &prefix)
{
    for (int length = 1; length <= 6; ++ length) {
        // Format only the short numeric suffix into a fixed buffer; the prefix
        // is appended as a string so an arbitrarily long path cannot overflow.
        char suffix[32];
        snprintf(suffix, sizeof(suffix), "%d.csv", length);
        const string filename = prefix + suffix;
        FILE* in = tryOpen(filename, "r");
        
        if (in == NULL) {
            cerr << "[Warning] No length " << length << " phrases." << endl;
            continue;
        }
        getLine(in); // discard the first line of the file
        double sum = 0; // only consumed by the debug print below
        for (;getLine(in);) {
            vector<string> tokens = splitBy(line, ',');
            if (tokens.size() < 3) {
                // Malformed row: the value column is missing.
                continue;
            }
            string pattern = tolower(tokens[0]);
            double value = 0;
            fromString(tokens[2], value);
            prob[pattern] = value;
            sum += value;
        }
        //fprintf(stderr, "sum %d = %.6f\n", length, sum);
        fclose(in);
    }
    //cerr << "# prob = " << prob.size() << endl;
}
Example #6
0
// Joins relPath onto basePath, cleans the combined path in two passes and
// writes the result back into relPath.
//   relPath  - in/out: the path to append on entry, the joined path on return
//   basePath - the path relPath is appended to; not modified
// NOTE(review): cutLast, segP, splitBy and `slashes` are defined elsewhere.
// The remarks about them below follow the pre-existing comments in this
// function and should be confirmed against their definitions.
void joinPaths(Str& relPath, const Str& basePath)
{
    Str segment;
    DStr absPath;
    // append the relPath to all-but-the-last-segment-of-basePath

    // absPath is assigned basePath and handed to cutLast; the Bool it returns
    // (endSlash) decides whether a "/" separator has to be inserted below.
    Bool endSlash = cutLast(absPath = basePath, 1),
        lastSeg;
    DStr result = absPath + (endSlash? "" : "/") + relPath;
       
    // throw out all '.' from the path
    // First pass: walk `result` segment by segment and rebuild absPath from
    // the segments for which segP(segment, 1) is false.
    const char *p = (const char*) result;
    absPath.empty();
    while(splitBy(p, slashes, segment))
    {
        if (!segP(segment, 1))
            absPath += segment + "/";
    }
    // The segment left over when splitBy stops is appended without a trailing "/".
    if (!segP(segment, 1))
        absPath += segment;

    // throw out all "something/.." from the path
    // Second pass: walk absPath and rebuild `result`. `depth` counts the
    // segments appended so far that can still be cancelled.
    p = (char*) absPath;
    int depth = 0;
    result.empty();
    do
    {
        // lastSeg becomes true once splitBy reports that nothing follows.
        lastSeg = (Bool) !splitBy(p, slashes, segment);
        if (!segP(segment, 2))
        {
            // Ordinary segment: keep it.
            result += segment + (lastSeg ? "" : "/");
            depth++;
        }
        else
        {
            // segP(segment, 2) matched (presumably a ".." segment - confirm).
            if (depth > 1)
            {
                // Drop what was appended last via cutLast(result, 2).
                cutLast(result, 2);
                depth--;
            }
            else
                // Nothing left to cancel against: keep the segment as it is.
                result += segment + (lastSeg ? "" : "/");
        };
    }
    while(!lastSeg);
    relPath = result;
}
/**
 * Builds the statistical feature rows for every multi-word pattern.
 *
 * The first returned string is the CSV header; one row follows for each key
 * of `patterns` that contains a space, in the iteration order of `patterns`.
 * For each such pattern the split point (at a space) with the largest
 * prob[left] * prob[right] is chosen, giving the halves AB and CD, and the
 * row holds:
 *   prob_feature      prob[pattern] / (prob[AB] * prob[CD])
 *   occur_feature     count / sqrt(patterns[AB] * patterns[CD])
 *   log_occur_feature sqrt(count) * log(prob_feature)
 *   prob_log_occur    prob[pattern] * log(prob_feature)
 *   constant          the literal 1
 *   outsideSentence   sqrt of the log(prob)^2-weighted mean of f[pattern][i]^2
 *   new_outside       word2idf-weighted mean of sumOutside[pattern][i],
 *                     divided by total[pattern] when that is positive
 *   frequency         count
 *
 * Side effects kept from the original: f[pattern][i] is divided by
 * total[pattern] in place (so calling this twice rescales it twice), and
 * lookups through operator[] on `prob`, `total`, `f`, `sumOutside` and
 * `word2idf` may add default entries. `patterns` itself is no longer modified:
 * the counts of AB and CD are read with find(), because inserting into the
 * container being iterated could invalidate `iter`.
 *
 * No denominator is guarded; zero products or zero weights yield inf/nan in
 * the formatted output, exactly as before.
 *
 * @param patterns   pattern -> occurrence count (iterated, read only).
 * @param prob       phrase -> probability-like score.
 * @param f          per-token accumulators, normalised in place.
 * @param total      pattern -> number of counted occurrences.
 * @param sumOutside per-token accumulators used for new_outside.
 * @return header line followed by one formatted row per multi-word pattern.
 */
vector<string> printStat(MAP_S_D &patterns, MAP_S_D &prob, unordered_map<string, vector<double> > &f, MAP_S_D &total, unordered_map<string, vector<double> > &sumOutside)
{
    vector<string> ret(1, "prob_feature,occur_feature,log_occur_feature,prob_log_occur,constant,outsideSentence,new_outside,frequency");
    char temp[1000];
    FOR (iter, patterns) {
        const string &pattern = iter->first;
        if (pattern.find(' ') == string::npos) {
            continue;
        }

        // Pick the two-way split whose halves have the largest score product.
        string AB = "";
        string CD = "";
        double best = -1;
        for (size_t i = 0; i < pattern.size(); ++ i) {
            if (pattern[i] == ' ') {
                string left = pattern.substr(0, i);
                string right = pattern.substr(i + 1);
                double current = prob[left] * prob[right];
                if (current > best) {
                    best = current;
                    AB = left;
                    CD = right;
                }
            }
        }

        // Non-inserting lookups; a missing half counts as 0, which is the
        // value operator[] would have default-inserted.
        MAP_S_D::iterator itAB = patterns.find(AB);
        MAP_S_D::iterator itCD = patterns.find(CD);
        const double occurAB = (itAB == patterns.end()) ? 0 : itAB->second;
        const double occurCD = (itCD == patterns.end()) ? 0 : itCD->second;

        double f1 = prob[pattern] / (prob[AB] * prob[CD]);
        double f2 = iter->second / sqrt(occurAB * occurCD);
        double f3 = sqrt(iter->second) * log(f1);
        double f4 = prob[pattern] * log(f1);
        
        vector<string> tokens = splitBy(pattern, ' ');
        double sum = 0, norm = 0;
        for (size_t i = 0; i < tokens.size(); ++ i) {
            if (total[pattern] > 0) {
                f[pattern][i] /= total[pattern];
            }
            double wi = log(prob[tokens[i]]);
            sum += sqr(f[pattern][i]) * sqr(wi);
            norm += sqr(wi);
        }
        double outside = sqrt(sum / norm);
        
        sum = norm = 0;
        for (size_t i = 0; i < tokens.size(); ++ i) {
            sum += sumOutside[pattern][i] * word2idf[tokens[i]];
            norm += word2idf[tokens[i]];
        }
        if (total[pattern] > 0) {
            sum /= total[pattern];
        }
        double newOutside = sum / norm;
        
        // Bounded write: a very large double printed with "%.10f" can be
        // hundreds of characters long.
        snprintf(temp, sizeof(temp), "%.10f,%.10f,%.10f,%.10f,1,%.10f,%.10f,%.0f", f1, f2, f3, f4, outside, newOutside, iter->second);
        ret.push_back(temp);
    }
    return ret;
}
Example #8
0
// Cuts an absolute URI at the first ':' into a scheme and a remainder.
//   S        - situation object, forwarded to schemeToURI_
//   absolute - the URI text to split
//   scheme   - out: receives the part produced by splitBy for the ":" delimiter
//   rest     - out: receives whatever the cursor points at after the split
// Returns the value schemeToURI_ yields for the extracted scheme.
// The remainder is stored as is; an older variant that skipped a leading
// "//" is no longer active.
URIScheme uri2SchemePath(Sit S, const char *absolute, Str& scheme, Str& rest)
{
    const char *cursor = absolute;
    Bool hasScheme = (Bool) !!splitBy(cursor, ":", scheme);
    sabassert(hasScheme);
    rest = (char*) cursor;
    return schemeToURI_(S, scheme);
}
 Data(string line, bool isTrain) {
     vector<string> tokens = splitBy(line, ',');
     if (isTrain) {
         values = tokens;
     } else {
         for (int i = 0; i < tokens.size(); ++ i) {
             values.push_back(tokens[i]);
             if (i == 0) {
                 values.push_back("0"); // fake a value for IsBadBuy for testing data
             }
         }
     }
 }
/**
 * Reads a feature table and returns its header followed by one row per
 * distinct pattern.
 *
 * The first line of the file is kept as the header. Every later line is
 * stored in `patterns` under its first comma-separated field (lowered with
 * tolower), so a later line with the same key replaces an earlier one. The
 * stored lines are then appended to the result in the iteration order of
 * `patterns`.
 *
 * Lines that split into no fields are skipped. If the file cannot be opened
 * an empty vector is returned.
 *
 * @param filename path of the feature table.
 * @return header line followed by the stored rows.
 */
vector<string> loadFeatureTable(const string &filename)
{
	FILE* in = tryOpen(filename, "r");
	vector<string> ret;
	if (in == NULL) {
		return ret;
	}
	getLine(in);
	ret.push_back(line);
	for (;getLine(in);) {
		vector<string> tokens = splitBy(line, ',');
		if (tokens.empty()) {
			continue;
		}
		string pattern = tolower(tokens[0]);
		patterns[pattern] = line;
	}
	fclose(in);
	FOR (iter, patterns) {
        ret.push_back(iter->second);
	}
	return ret;
}
/**
 * Loads pattern occurrence counts from a comma-separated file into `oldProb`.
 *
 * Field 0 of each line (lowered with tolower) is the pattern and field 1 is
 * parsed as its integer count. Lines with fewer than two fields are skipped;
 * if the file cannot be opened nothing is loaded.
 *
 * @param filename path of the pattern file.
 */
void loadPattern(const string &filename)
{
	FILE* in = tryOpen(filename, "r");
	if (in == NULL) {
		// No stream to read from.
		return;
	}
	for (;getLine(in);) {
		vector<string> tokens = splitBy(line, ',');
		if (tokens.size() < 2) {
			// Malformed line (no count column): skip rather than index past the end.
			continue;
		}
		string pattern = tolower(tokens[0]);
		int occurrence = 0; // stays 0 if fromString() leaves it untouched
		fromString(tokens[1], occurrence);

		oldProb[pattern] = occurrence;
	}
	fclose(in);
//	cerr << "# old prob = " << oldProb.size() << endl;
}
Example #12
0
// Breaks `uri` into five components stored in `parts`, indexed by U_SCHEME,
// U_AUTH, U_PATH, U_QUERY and U_FRAG.
//   uri   - the text to split
//   parts - out: all five entries are reset first, then filled as far as the
//           parse gets
// NOTE(review): RF(...) is a macro defined elsewhere - presumably "return from
// the function when the expression is false/zero". splitBy presumably moves
// `rest` past the delimiter it stopped at and returns that delimiter
// character. Confirm both before relying on the notes below.
void splitURI(const char *uri, FiveStr &parts)
{
    const char *rest;
    char c;
    for (int i = 0; i < 5; i++) 
        parts[i].empty();
    // Nothing to do for a null or empty uri.
    RF( uri && *uri );
    // extract the scheme part of the URI
    // `rest` starts at the beginning of uri; if no ":" is found the scheme
    // entry is reset again.
    if (!splitBy(rest = uri, ":", parts[U_SCHEME]))
        parts[U_SCHEME].empty();
    // if "//" follows, extract the authority part
    c = 'A';    // marks the absence of auth
    if (isSlash(*rest) && isSlash(rest[1]))
        // Skip the two slashes, then read up to a slash, '?' or '#'.
        // `slashes` is written next to a string literal here, so it is expected
        // to expand to a string literal itself.
        RF( c = splitBy(rest += 2, slashes"?#", parts[U_AUTH]) );
    if (isSlash(c) || c == 'A')
      // extract the path
      // When the authority ended at a slash, `rest` is moved back by one so
      // that slash becomes part of the path.
      RF( c = splitBy(rest -= (isSlash(c)), "?#", parts[U_PATH]) );
    //query and fragment
    if (c == '?')
        // extract the query
        RF( c = splitBy(rest, "#", parts[U_QUERY]) );
    // copy the fragment
    // Whatever `rest` still points at is stored as the fragment.
    parts[U_FRAG] = (char *) rest;
};
// Handles a line that run() dispatches for the "set " verb: the line is split
// on spaces (0x20) into at most three pieces and pieces[1] / pieces[2] are
// used as field name and (unescaped) field value for fTuple.
//   status - in/out: nothing happens if it already signals failure; it is set
//            to U_PARSE_ERROR when fewer than three pieces are found.
// Problems are reported through showError().
void DataDrivenNumberFormatTestSuite::setTupleField(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    UnicodeString pieces[3];
    const int32_t pieceCount = splitBy(pieces, UPRV_LENGTHOF(pieces), 0x20);

    if (pieceCount >= 3) {
        if (fTuple.setField(
                NumberFormatTestTuple::getFieldByName(pieces[1]),
                pieces[2].unescape(),
                status)) {
            return;
        }
        showError("Invalid field value");
        return;
    }

    // Verb plus field name plus value were expected.
    showError("Set expects 2 parameters");
    status = U_PARSE_ERROR;
}
/**
 * Loads the IDF table and the stop-word list.
 *
 * idfFile: each line with exactly two comma-separated fields stores
 * word2idf[field0] = field1 parsed as a double; any other line is echoed to
 * stderr and ignored.
 * stopFile: every line read is recorded as a stop word (stopwords[line] = 1).
 *
 * Each stream is closed once it has been consumed. A file that cannot be
 * opened is skipped and contributes nothing.
 *
 * @param stopFile path of the stop-word list.
 * @param idfFile  path of the "word,idf" table.
 */
void loadStopwords(string stopFile, string idfFile)
{
    FILE* in = tryOpen(idfFile, "r");
    if (in != NULL) {
        for (;getLine(in);) {
            vector<string> tokens = splitBy(line, ',');
            if (tokens.size() == 2) {
                string word = tokens[0];
                double idf = 0;
                fromString(tokens[1], idf);
                word2idf[word] = idf;
            } else {
                cerr << line << endl;
            }
        }
        // Release the handle before the variable is reused for the next file.
        fclose(in);
    }

    in = tryOpen(stopFile, "r");
    if (in != NULL) {
        for (;getLine(in);) {
            stopwords[line] = 1;
        }
        fclose(in);
    }
}
// Runs the data-driven tests described in `fileName`, a file located under
// the directory returned by getSourceTestData().
//   fileName    - name of the data file, appended to the test-data path
//   runAllTests - when true, rows are checked even if breaksC() is true
//
// The file is processed line by line with a three-state machine:
//   state 0 - waiting for a verb: a line starting with "test " (stores the
//             line as the test name and clears fTuple), "set " (handled by
//             setTupleField) or "begin" (moves to state 1); anything else is
//             reported as an error and ends the run
//   state 1 - the column header: tab-separated (0x9) field names
//   state 2 - data rows: tab-separated values, one test per row; an empty
//             line switches back to state 0
// Lines beginning with "//" are discarded in every state.
// NOTE(review): readLine() presumably fills fFileLine with the next line and
// splitBy() presumably splits fFileLine - neither is visible here; confirm.
void DataDrivenNumberFormatTestSuite::run(const char *fileName, UBool runAllTests) {
    fFileLineNumber = 0;
    fFormatTestNumber = 0;
    UErrorCode status = U_ZERO_ERROR;
    // Replace every remembered formatter with a fresh one.
    for (int32_t i = 0; i < UPRV_LENGTHOF(fPreviousFormatters); ++i) {
        delete fPreviousFormatters[i];
        fPreviousFormatters[i] = newFormatter(status);
    }
    if (!assertSuccess("Can't create previous formatters", status)) {
        return;
    }
    // Build the full path of the data file and open it with "UTF-8" as the
    // requested code page.
    CharString path(getSourceTestData(status), status);
    path.appendPathPart(fileName, status);
    const char *codePage = "UTF-8";
    LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
    if (!assertSuccess("Can't open data file", status)) {
        return;
    }
    UnicodeString columnValues[kNumberFormatTestTupleFieldCount];
    ENumberFormatTestTupleField columnTypes[kNumberFormatTestTupleFieldCount];
    // columnCount is assigned in state 1 and only read in state 2, which can
    // only be entered from state 1.
    int32_t columnCount;
    int32_t state = 0;
    while(U_SUCCESS(status)) {
        // Read a new line if necessary.
        if(fFileLine.isEmpty()) {
            if(!readLine(f.getAlias(), status)) { break; }
            // An empty line while reading data rows ends the current table.
            if (fFileLine.isEmpty() && state == 2) {
                state = 0;
            }
            continue;
        }
        // Comment line: discard it so the next pass reads a new line.
        if (fFileLine.startsWith("//")) {
            fFileLine.remove();
            continue;
        }
        // Initial setup of test.
        if (state == 0) {
            if (fFileLine.startsWith(UNICODE_STRING("test ", 5))) {
                fFileTestName = fFileLine;
                fTuple.clear();
            } else if(fFileLine.startsWith(UNICODE_STRING("set ", 4))) {
                setTupleField(status);
            } else if(fFileLine.startsWith(UNICODE_STRING("begin", 5))) {
                state = 1;
            } else {
                showError("Unrecognized verb.");
                return;
            }
        // column specification
        } else if (state == 1) {
            columnCount = splitBy(columnValues, UPRV_LENGTHOF(columnValues), 0x9);
            for (int32_t i = 0; i < columnCount; ++i) {
                columnTypes[i] = NumberFormatTestTuple::getFieldByName(
                    columnValues[i]);
                // kNumberFormatTestTupleFieldCount is treated as "no such field".
                if (columnTypes[i] == kNumberFormatTestTupleFieldCount) {
                    showError("Unrecognized field name.");
                    return;
                }
            }
            state = 2;
        // run the tests
        } else {
            int32_t columnsInThisRow = splitBy(columnValues, columnCount, 0x9);
            // Set the fields present in this row ...
            for (int32_t i = 0; i < columnsInThisRow; ++i) {
                fTuple.setField(
                        columnTypes[i], columnValues[i].unescape(), status);
            }
            // ... and clear the columns the row left out.
            for (int32_t i = columnsInThisRow; i < columnCount; ++i) {
                fTuple.clearField(columnTypes[i], status);
            }
            if (U_FAILURE(status)) {
                showError("Invalid column values");
                return;
            }
            // Rows for which breaksC() is true are skipped unless runAllTests is set.
            if (!breaksC() || runAllTests) {
                UnicodeString errorMessage;
                if (!isPass(fTuple, errorMessage, status)) {
                    showFailure(errorMessage);
                }
            }
        }
        // Mark the current line as consumed so the next pass reads a new one.
        fFileLine.remove();
    }
}
/**
 * Executes an INSERT statement against the relation named by words[2].
 *
 * Two forms are handled:
 *  - Literal values (no "SELECT" token in `words`): `line` is split on '('
 *    and ')'; piece 1 is the field list and piece 3 the value list. Each
 *    listed field is matched by name against the relation's columns and the
 *    value is stored as an int (via atoi) or a string according to the column
 *    type. The resulting tuple is appended to the relation.
 *  - INSERT ... SELECT: the tokens from "SELECT" onward are evaluated with
 *    Select(). All tuples of the result are first copied out block by block
 *    (the source may be the target relation itself), then each one is
 *    converted field by field - columns are matched by name - and appended.
 *    The target relation is printed to stdout afterwards.
 *
 * Malformed input is reported on stderr and NULL is returned instead of
 * reading out of range: fewer than three tokens, an unknown relation, a
 * statement without both parenthesised lists, or field/value lists of
 * different length.
 *
 * @param words          statement tokens; words[2] is the target table name.
 * @param line           the raw statement text (used for the literal form).
 * @param schema_manager used to look up the target relation.
 * @param mem            main memory used for block transfers.
 * @return the target relation, or NULL when the statement was rejected.
 */
Relation* Insert(vector<string> &words, string &line, SchemaManager &schema_manager, MainMemory &mem){
	if (words.size() < 3){
		cerr<<"Error: Insert, statement has no table name!"<<endl;
		return NULL;
	}
	Relation* relation_ptr = schema_manager.getRelation(words[2]);
	if (!relation_ptr){
		cerr<<"Error: Insert, cannot find relation "<<words[2]<<endl;
		return NULL;
	}

	vector<string>::iterator it = find(words.begin(), words.end(), "SELECT");
	// no select
	if (it == words.end()){
		// get insert vals
		vector<string> content = splitBy(line, "()");
		if (content.size() < 4){
			cerr<<"Error: Insert, expected (fields) VALUES (values)!"<<endl;
			return NULL;
		}
		vector<string> fields = splitBy(content[1], ", ");
		vector<string> vals = splitBy(content[3], ",");
		//preProcess(vector<string>(1, words[2]), fields, schema_manager);
		preProcess(vector<string>(1, words[2]), vals, schema_manager);

		if (fields.size() != vals.size()){
			cerr<<"Error: Insert, number of fields and values differ!"<<endl;
			return NULL;
		}

		Tuple tuple = relation_ptr->createTuple();

		// standard insert doesn't have table names
		vector<string> col_names = nakedFieldNames(relation_ptr);

		// comparing 
		for (size_t i = 0; i < fields.size(); i++){
			for (int j = 0; j < static_cast<int>(col_names.size()); j++){
				// this is a match
				if (fields[i] == col_names[j]){
					if (tuple.getSchema().getFieldType(j) == INT){
						tuple.setField(j, atoi(vals[i].c_str()));
					}
					else{
						tuple.setField(j, vals[i]);
					}
					break;
				}
			}
		}
		appendTupleToRelation(relation_ptr, mem, tuple);
	}
	// with SELECT
	else{
		vector<string> SFW(it, words.end());	
		Relation* new_relation = Select(SFW, schema_manager, mem);
		assert(new_relation);

		vector<string> new_field_names = nakedFieldNames(new_relation);
		vector<string> field_names = nakedFieldNames(relation_ptr);

		// mapping: index of new_field_names to field_names 
		vector<int> mapping(new_field_names.size(), -1);
		for (size_t i = 0; i < new_field_names.size(); i++){
			for (size_t j = 0; j < field_names.size(); j++){
				if (new_field_names[i] == field_names[j]){
					mapping[i] = static_cast<int>(j);
					break;
				}
			}
		}

		int new_field_size = new_relation->getSchema().getNumOfFields();

		// warning: new_relation and relation_ptr might be the same!
		// get all tuples from the new_relation in one run
		vector<Tuple> new_tuples;
		for (int i = 0; i < new_relation->getNumOfBlocks(); i++){

			assert(!free_blocks.empty());
			int memory_block_index = free_blocks.front();
			free_blocks.pop();

			// read the relation block by block
			new_relation->getBlock(i, memory_block_index);
			Block* block_ptr = mem.getBlock(memory_block_index);
			assert(block_ptr);
			vector<Tuple> block_tuples = block_ptr->getTuples();
			new_tuples.insert(new_tuples.end(), block_tuples.begin(), block_tuples.end());
			// The message is about the block just read, so test that block and
			// not the tuples accumulated so far.
			if(block_tuples.empty()){
				cerr<<"Warning: Insert from SFW, No tuples in the current mem block!"<<endl;
			}
			// hand the memory block back for reuse
			free_blocks.push(memory_block_index);
		}

		for (size_t j = 0; j < new_tuples.size(); j++){
			Tuple tuple = relation_ptr->createTuple();
			for (int k = 0; k < new_field_size; k++){
				if (mapping[k] != -1){
					int idx = mapping[k];
					assert(idx < relation_ptr->getSchema().getNumOfFields() && idx >= 0);
					if (tuple.getSchema().getFieldType(idx) == INT){
						int val = new_tuples[j].getField(k).integer;
						tuple.setField(field_names[idx], val);
					}
					else{
						string *str = new_tuples[j].getField(k).str;
						tuple.setField(field_names[idx], *str);
					}
				}
			}
			appendTupleToRelation(relation_ptr, mem, tuple);
		}
		cout<<*relation_ptr<<endl;
	}
	return relation_ptr;
}
int main(int argc, char* argv[])
{
    if (argc != 6) {
        cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl;
        return -1;
    }
    selftest();
    
    loadSentences(argv[1]);
    loadPattern(argv[2]);
    loadStopwords(argv[3], argv[4]);
    
    int corpusTokensN = 0;
    for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) {
        const string &text = sentences[sentenceID];
        string alpha = text;
        for (size_t i = 0; i < alpha.size(); ++ i) {
            if (isalpha(alpha[i])) {
                alpha[i] = tolower(alpha[i]);
            } else {
                if (alpha[i] != '\'') {
					alpha[i] = ' ';
				}
            }
        }
        corpusTokensN += splitBy(alpha, ' ').size();
        
        string outsideText = alpha;
        if (sentenceID > 0) {
            outsideText += " " + sentences[sentenceID - 1];
        }
        if (sentenceID + 1 < sentences.size()) {
            outsideText += " " + sentences[sentenceID + 1];
        }
        for (size_t i = 0; i < outsideText.size(); ++ i) {
            if (isalpha(outsideText[i])) {
                outsideText[i] = tolower(outsideText[i]);
            } else {
                outsideText[i] = ' ';
            }
        }
        
        vector<string> outside = splitBy(outsideText, ' ');
        unordered_map<string, int> outsideCnt;
        FOR (token, outside) {
            ++ outsideCnt[*token];
        }
        
        vector< pair<int, int> > positions;
        tree.search(" " + alpha + " ", positions);
        
        unordered_map<string, int> patternCnt;
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);
            ++ patternCnt[pattern];
        }
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);
            
            vector<string> tokens = splitBy(pattern, ' ');
            unordered_map<string, int> tokenCnt;
            int delta = patternCnt[pattern];
            for (size_t i = 0; i < tokens.size(); ++ i) {
                tokenCnt[tokens[i]] += delta;
            }
            for (size_t i = 0; i < tokens.size(); ++ i) {
                if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) {
                    f[pattern][i] += 1;
                    sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]];
                }
            }
            
            total[pattern] += 1;
            
            if (st > 0 && ed < (int)text.size()) {
                if (text[st - 1] == '(' && text[ed] == ')') {
                    parenthesis[pattern] += 1;
                }
                if (text[st - 1] == '"' && text[ed] == '"') {
                    quote[pattern] += 1;
                }
            }
            
            bool found = false;
            for (int i = st; i < ed && !found; ++ i) {
                found |= text[i] == '-';
            }
            dash[pattern] += found;
            
            bool valid = true;
            for (int i = st; i < ed && valid; ++ i) {
                if (isalpha(alpha[i]) && (i == st || alpha[i - 1] == ' ')) {
                    if (text[i] < 'A' && text[i] > 'Z') {
                        valid = false;
                    }
                }
            }
            capital[pattern] += valid;
        }