/**
 * Visits one element and hands every configured tag key found on it to _translate(), unless
 * the key already has an ":en" counterpart and pre-translated tags are being honored.
 *
 * Also maintains the visitor's counters: elements with a successful translation (settled one
 * visit late, see below), elements for which a translation was attempted, and total elements
 * visited. Progress is logged every _taskStatusUpdateInterval elements.
 *
 * @param e the element to examine
 * @throws HootException if no tag keys have been configured
 */
void ToEnglishTranslationVisitor::visit(const boost::shared_ptr<Element>& e)
{
  if (_tagKeys.isEmpty())
  {
    throw HootException("No tag keys specified for language translation.");
  }

  LOG_VART(e);

  //The success flag belongs to the element handled by the previous call, so its counter is
  //only bumped now, before the flag is reset for the current element.
  if (_currentElementHasSuccessfulTagTranslation)
  {
    _numElementsWithSuccessfulTagTranslation++;
  }
  _currentElementHasSuccessfulTagTranslation = false;

  const Tags& elementTags = e->getTags();
  bool translationAttempted = false;
  for (QSet<QString>::const_iterator keyItr = _tagKeys.begin(); keyItr != _tagKeys.end();
       ++keyItr)
  {
    const QString sourceKey = *keyItr;
    if (!elementTags.contains(sourceKey))
    {
      continue;
    }

    //Skipping keys that already carry an English tag is optional, since many of the OSM English
    //translations are either copies of the foreign language text or poor translations.
    const QString englishKey = sourceKey + ":en";
    const bool honorExistingEnglishTag =
      !_ignorePreTranslatedTags && elementTags.contains(englishKey);
    if (honorExistingEnglishTag)
    {
      LOG_TRACE(
        "Skipping element with pre-translated tag: " << englishKey << "=" <<
        elementTags.get(sourceKey).trimmed());
      continue;
    }

    _translate(e, sourceKey);
    translationAttempted = true;
  }

  if (translationAttempted)
  {
    _numProcessedElements++;
    const bool reportProcessed = (_numProcessedElements % _taskStatusUpdateInterval == 0);
    if (reportProcessed)
    {
      PROGRESS_INFO("Attempted tag translation for " << _numProcessedElements << " elements.");
    }
  }

  _numTotalElements++;
  const bool reportVisited = (_numTotalElements % _taskStatusUpdateInterval == 0);
  if (reportVisited)
  {
    PROGRESS_INFO("Visited " << _numTotalElements << " elements.");
  }
}
void ImplicitTagRawRulesDeriver::_resolveCountTies() { //Any time more than one word/key combo has the same occurrence count, we need to pick just one //of them. LOG_INFO( "Resolving word/tag key/count ties for " << StringUtils::formatLargeNumber(_duplicatedWordTagKeyCountsToValues.size()) << " duplicated word/tag key/counts..."); _tieResolvedCountFile.reset( new QTemporaryFile( _tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX")); _tieResolvedCountFile->setAutoRemove(!_keepTempFiles); if (!_tieResolvedCountFile->open()) { throw HootException( QObject::tr("Error opening %1 for writing.").arg(_tieResolvedCountFile->fileName())); } LOG_DEBUG("Opened tie resolve temp file: " << _tieResolvedCountFile->fileName()); if (_keepTempFiles) { LOG_WARN("Keeping temp file: " << _tieResolvedCountFile->fileName()); } if (!_dedupedCountFile->open()) { throw HootException( QObject::tr("Error opening %1 for reading.").arg(_dedupedCountFile->fileName())); } long lineCount = 0; long duplicateResolutions = 0; while (!_dedupedCountFile->atEnd()) { const QString line = QString::fromUtf8(_dedupedCountFile->readLine().constData()).trimmed(); LOG_VART(line); const QStringList lineParts = line.split("\t"); LOG_VART(lineParts); QString word = lineParts[1].trimmed(); LOG_VART(word); const QString kvp = lineParts[2].trimmed(); LOG_VART(kvp); const QString countStr = lineParts[0].trimmed(); const long count = countStr.toLong(); LOG_VART(count); const QStringList kvpParts = kvp.split("="); const QString tagKey = kvpParts[0]; LOG_VART(tagKey); const QString wordTagKey = word.trimmed() % ";" % tagKey.trimmed(); LOG_VART(wordTagKey); const QString wordTagKeyCount = word.trimmed() % ";" % tagKey.trimmed() % ";" % countStr.trimmed(); LOG_VART(wordTagKeyCount); const QString tagValue = kvpParts[1]; LOG_VART(tagValue); if (_duplicatedWordTagKeyCountsToValues.contains(wordTagKeyCount)) { LOG_TRACE("Resolving duplicated word/tag key/count for " << wordTagKeyCount << "..."); //To resolve the 
tie, we're going to pick the most specific kvp. e.g. amenity=public_hall //wins out of amenity=hall. This is not really dealing with same hierarchy level tags //(e.g. amenity=school and amenity=hall) and will just arbitrarily pick in that situation. //Duplicates do seem to be fairly rare, but there could be some perfomance gains by coming //up with a better way to handle this situation. QString lineWithMostSpecificKvp = line % "\n"; const QStringList tagValues = _duplicatedWordTagKeyCountsToValues[wordTagKeyCount]; for (int i = 0; i < tagValues.size(); i++) { const QString childKvp = tagKey % "=" % tagValues[i]; if (OsmSchema::getInstance().isAncestor(childKvp, tagKey % "=" % tagValue)) { lineWithMostSpecificKvp = countStr % "\t" % word % "\t" % childKvp % "\n"; } } LOG_VART(lineWithMostSpecificKvp); _tieResolvedCountFile->write(lineWithMostSpecificKvp.toUtf8()); duplicateResolutions++; } else { const QString updatedLine = countStr % "\t" % word % "\t" % kvp % "\n"; LOG_VART(updatedLine); _tieResolvedCountFile->write(updatedLine.toUtf8()); } lineCount++; if (lineCount % (_statusUpdateInterval * 10) == 0) { PROGRESS_INFO( "Parsed " << StringUtils::formatLargeNumber(lineCount) << " lines from input for duplicated tag key count ties."); } } LOG_VARD(lineCount); LOG_INFO( "Resolved " << StringUtils::formatLargeNumber(duplicateResolutions) << " word/tag key/count ties."); _duplicatedWordTagKeyCountsToValues.clear(); _tieResolvedCountFile->close(); }
void ImplicitTagRawRulesDeriver::_removeDuplicatedKeyTypes() { LOG_INFO("Removing duplicated tag key types from output..."); //i.e. don't allow amenity=school AND amenity=shop to be associated with the same word...pick one //of them _dedupedCountFile.reset( new QTemporaryFile( _tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX")); _dedupedCountFile->setAutoRemove(!_keepTempFiles); if (!_dedupedCountFile->open()) { throw HootException( QObject::tr("Error opening %1 for writing.").arg(_dedupedCountFile->fileName())); } LOG_DEBUG("Opened dedupe temp file: " << _dedupedCountFile->fileName()); if (_keepTempFiles) { LOG_WARN("Keeping temp file: " << _dedupedCountFile->fileName()); } long lineCount = 0; long writtenLineCount = 0; while (!_sortedCountFile->atEnd()) { const QString line = QString::fromUtf8(_sortedCountFile->readLine().constData()).trimmed(); LOG_VART(line); const QStringList lineParts = line.split("\t"); LOG_VART(lineParts); QString word = lineParts[1].trimmed(); LOG_VART(word); const QString kvp = lineParts[2].trimmed(); LOG_VART(kvp); const QString countStr = lineParts[0].trimmed(); const long count = countStr.toLong(); LOG_VART(count); const QStringList kvpParts = kvp.split("="); const QString tagKey = kvpParts[0]; LOG_VART(tagKey); const QString wordTagKey = word.trimmed() % ";" % tagKey.trimmed(); LOG_VART(wordTagKey); const QString wordTagKeyCount = word.trimmed() % ";" % tagKey.trimmed() % ";" % countStr.trimmed(); LOG_VART(wordTagKeyCount); const QString tagValue = kvpParts[1]; LOG_VART(tagValue); //The lines are sorted in reverse by occurrence count. So the first time we see one word/key //combo, we know it had the highest occurrence count, and we can ignore all subsequent //instances of it since any one feature can't have more than one tag applied to it with the //same key. 
const QString queriedCountAndValue = _wordKeysToCountsValues.value(wordTagKey, ""); if (queriedCountAndValue.isEmpty()) { _wordKeysToCountsValues[wordTagKey] = countStr % ";" % tagValue; //this unescaping must occur during the final temp file write if (word.contains("%3D")) { word = word.replace("%3D", "="); } else if (word.contains("%3d")) { word = word.replace("%3d", "="); } const QString updatedLine = countStr % "\t" % word % "\t" % kvp % "\n"; LOG_VART(updatedLine); _dedupedCountFile->write(updatedLine.toUtf8()); writtenLineCount++; } else { const long queriedCount = queriedCountAndValue.split(";")[0].toLong(); if (queriedCount == count) { LOG_TRACE( "Recording duplicated word/tag key/count for: " << wordTagKeyCount << " with value: " << tagValue); _duplicatedWordTagKeyCountsToValues[wordTagKeyCount].append(tagValue); } } lineCount++; if (lineCount % (_statusUpdateInterval * 10) == 0) { PROGRESS_INFO( "Parsed " << StringUtils::formatLargeNumber(lineCount) << " lines from input for duplicated tag key removal."); } } _sortedCountFile->close(); LOG_INFO( "Wrote " << StringUtils::formatLargeNumber(writtenLineCount) << " lines to deduped file."); _dedupedCountFile->close(); }
void ImplicitTagRawRulesDeriver::deriveRawRules(const QStringList inputs, const QStringList translationScripts, const QString output) { _validateInputs(inputs, translationScripts, output); LOG_INFO( "Generating implicit tag rules raw file for inputs: " << inputs << ", translation scripts: " << translationScripts << ". Writing to output: " << output << "..."); LOG_VARD(_sortParallelCount); LOG_VARD(_skipFiltering); LOG_VARD(_translateNamesToEnglish); _init(); long eligibleFeatureCount = 0; long totalFeatureCount = 0; for (int i = 0; i < inputs.size(); i++) { boost::shared_ptr<ElementInputStream> inputStream = _getInputStream(inputs.at(i), translationScripts.at(i)); while (inputStream->hasMoreElements()) { ElementPtr element = inputStream->readNextElement(); LOG_VART(element); totalFeatureCount++; assert(_elementCriterion.get()); if (_skipFiltering || _elementCriterion->isSatisfied(element)) { QStringList names = element->getTags().getNames(); assert(!names.isEmpty()); //old_name/former_name generally indicates that an element formerly went by the name, so //not really useful here. 
if (names.removeAll("old_name") > 0) { LOG_VART("Removed old name tag."); } if (names.removeAll("former_name") > 0) { LOG_VART("Removed former name tag."); } assert(!names.isEmpty()); if (_translateNamesToEnglish) { names = ImplicitTagUtils::translateNamesToEnglish(names, element->getTags(), _translator); } LOG_VART(names); //get back only the tags that we'd be interested in applying to future elements implicitly //based on name const QStringList kvps = _elementCriterion->getEligibleKvps(element->getTags()); assert(!kvps.isEmpty()); if (kvps.isEmpty()) { throw HootException("Kvps empty."); } //parse whole names and token groups _parseNames(names, kvps); eligibleFeatureCount++; if (eligibleFeatureCount % _statusUpdateInterval == 0) { PROGRESS_INFO( "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) << " eligible features / " << StringUtils::formatLargeNumber(totalFeatureCount) << " total features."); } } } _inputReader->finalizePartial(); } _countFile->close(); LOG_INFO( "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) << " eligible features from " << StringUtils::formatLargeNumber(totalFeatureCount) << " total features."); LOG_INFO( "Wrote " << StringUtils::formatLargeNumber(_countFileLineCtr) << " lines to count file."); _sortByTagOccurrence(); //sort in descending count order _removeDuplicatedKeyTypes(); bool tieCountsNeededResolved = false; if (_duplicatedWordTagKeyCountsToValues.size() > 0) { _resolveCountTies(); tieCountsNeededResolved = true; } LOG_INFO( "Extracted " << StringUtils::formatLargeNumber(_wordKeysToCountsValues.size()) << " word/tag associations."); LOG_INFO("Clearing word/tag associations..."); _wordKeysToCountsValues.clear(); if (tieCountsNeededResolved) { _sortByWord(_tieResolvedCountFile); } else { _sortByWord(_dedupedCountFile); } }
double AttributeComparator::compareMaps() { _updateBounds(); double scoreSum = 0.0; double buffer = 10.0; double oldIsACost = OsmSchema::getInstance().getIsACost(); OsmSchema::getInstance().setIsACost(0.5); vector<double> scores; // sampled standard deviation _s = -1; // 1.645 for 90% confidence, 1.96 for 95% confidence, and 2.58 for 99% confidence. double zalpha = 1.645; _ci = -1; boost::shared_ptr<OsmMap> referenceMap, otherMap; // do this a bunch of times for (int i = 0; i < _iterations * 4 && (int)scores.size() < _iterations; i++) { // generate a random source point _r.x = Random::instance()->generateUniform() * (_projectedBounds.MaxX - _projectedBounds.MinX) + _projectedBounds.MinX; _r.y = Random::instance()->generateUniform() * (_projectedBounds.MaxY - _projectedBounds.MinY) + _projectedBounds.MinY; // pick one map as the reference map if (Random::instance()->coinToss()) { referenceMap = _mapP1; otherMap = _mapP2; } else { referenceMap = _mapP2; otherMap = _mapP1; } // find the nearest way on the reference map vector<long> wids1 = referenceMap->getIndex().findWayNeighbors(_r, buffer); vector<long> wids2 = otherMap->getIndex().findWayNeighbors(_r, buffer); Tags t1, t2; double bestScore = -1.0; for (size_t j = 0; j < wids1.size(); j++) { WayPtr w1 = referenceMap->getWay(wids1[j]); for (size_t k = 0; k < wids2.size(); k++) { WayPtr w2 = otherMap->getWay(wids2[k]); double score = TagComparator::getInstance().compareTags(w1->getTags(), w2->getTags()); if (score > bestScore) { bestScore = score; t1 = w1->getTags(); t2 = w2->getTags(); } } } if (bestScore >= 0.0) { // LOG_INFO("===="); // LOG_INFO("score: " << bestScore); // LOG_INFO("t1: \n" << t1); // LOG_INFO("t2: \n" << t2); scoreSum += bestScore; scores.push_back(bestScore); sort(scores.begin(), scores.end()); _median = scores[scores.size() / 2]; _mean = scoreSum / (double)scores.size(); } if (scores.size() > 1) { double v = 0; for (size_t i = 0; i < scores.size(); i++) { v += (scores[i] - _mean) * (scores[i] - 
_mean); } _s = sqrt(v / (scores.size() - 1)); _ci = zalpha * _s / sqrt(scores.size()); } PROGRESS_INFO(i << " / " << _iterations << " mean: " << _mean << " "); } LOG_INFO(_iterations << " / " << _iterations << " mean: " << _mean << " "); OsmSchema::getInstance().setIsACost(oldIsACost); return _mean; }
void ImplicitTagRulesDatabaseDeriver::_applyFiltering(const QString input) { LOG_INFO("Applying word/tag/rule filtering to output..."); _filteredCountFile.reset( new QTemporaryFile( ConfigOptions().getApidbBulkInserterTempFileDir() + "/implicit-tag-rules-deriver-temp-XXXXXX")); _filteredCountFile->setAutoRemove(!ConfigOptions().getImplicitTaggingKeepTempFiles()); if (!_filteredCountFile->open()) { throw HootException( QObject::tr("Error opening %1 for writing.").arg(_filteredCountFile->fileName())); } LOG_DEBUG("Opened filtered temp file: " << _filteredCountFile->fileName()); if (ConfigOptions().getImplicitTaggingKeepTempFiles()) { LOG_WARN("Keeping temp file: " << _filteredCountFile->fileName()); } QFile inputFile(input); if (!inputFile.open(QIODevice::ReadOnly)) { throw HootException(QObject::tr("Error opening %1 for reading.").arg(input)); } LOG_DEBUG("Opened input file: " << input); long linesParsedCount = 0; long linesWrittenCount = 0; long wordsTooSmallCount = 0; long ignoredWordsCount = 0; long ignoredTagsCount = 0; long ignoredRuleCountDueToCustomRules = 0; long wordNotASchemaValueCount = 0; while (!inputFile.atEnd()) { const QString line = QString::fromUtf8(inputFile.readLine().constData()).trimmed(); LOG_VART(line); const QStringList lineParts = line.split("\t"); LOG_VART(lineParts); QString word = lineParts[1].trimmed(); LOG_VART(word); //this won't come back true unless _useSchemaTagValuesForWordsOnly = true. const bool wordNotASchemaTagValue = _wordIsNotASchemaTagValue(word); const bool wordTooSmall = word.length() < _minWordLength; //Skip the word if we already have a custom rule that is associated with it (they're applied //to the database after this filtering). 
if (!wordTooSmall && !_customRules.getWordIgnoreList().contains(word, Qt::CaseInsensitive) && !wordNotASchemaTagValue) { const QString kvp = lineParts[2].trimmed(); LOG_VART(kvp); const QString tagKey = kvp.split("=")[0]; LOG_VART(tagKey); const QString keyWildCard = tagKey % "=*"; LOG_VART(keyWildCard); const QStringList tagIgnoreList = _customRules.getTagIgnoreList(); const bool ignoreTag = !tagIgnoreList.isEmpty() && (tagIgnoreList.contains(kvp) || tagIgnoreList.contains(keyWildCard)); LOG_VART(ignoreTag); if (!ignoreTag) { const QString customRuleTag = _customRules.getCustomRulesList().value(word.toLower(), ""); if (customRuleTag == kvp) { LOG_TRACE( "Skipping word/tag combo on custom rule list. Word: " << word << ", tag: " << kvp << "."); ignoredRuleCountDueToCustomRules++; } else { //write the valid count line const long count = lineParts[0].trimmed().toLong(); LOG_VART(count); const QString line = QString::number(count) % "\t" % word % "\t" % kvp % "\n"; LOG_VART(line); _filteredCountFile->write(line.toUtf8()); linesWrittenCount++; } } else { if (ignoreTag) { LOG_TRACE("Skipping tag on the ignore list: " << kvp << "."); } else { LOG_TRACE("Skipping tag not on the include list: " << kvp << "."); } ignoredTagsCount++; } } else { if (wordTooSmall) { LOG_TRACE( "Skipping word: " << word << ", the length of which is less than the minimum allowed word length of: " << _minWordLength); wordsTooSmallCount++; } else if (wordNotASchemaTagValue) { LOG_TRACE( "Schema tag value requirement for word is being enforced and word is not a schema " << "tag value: " << word.toLower() << "."); _wordsNotInSchema.insert(word.toLower()); wordNotASchemaValueCount++; } else { LOG_TRACE("Skipping word on the ignore list: " << word << "."); ignoredWordsCount++; } } linesParsedCount++; if (linesParsedCount % (_statusUpdateInterval * 100) == 0) { PROGRESS_INFO( "Filtered " << StringUtils::formatLargeNumber(linesParsedCount) << " count file lines from input."); } } inputFile.close(); 
LOG_INFO("Parsed " << StringUtils::formatLargeNumber(linesParsedCount) << " words."); LOG_INFO( "Skipped " << StringUtils::formatLargeNumber(wordsTooSmallCount) << " words that were too small."); LOG_INFO("Ignored " << StringUtils::formatLargeNumber(ignoredWordsCount) << " words."); LOG_INFO("Ignored " << StringUtils::formatLargeNumber(ignoredTagsCount) << " tags."); LOG_INFO( "Ignored " << StringUtils::formatLargeNumber(ignoredRuleCountDueToCustomRules) << " rules due to them overlapping with the custom rules list."); LOG_INFO( "Skipped " << StringUtils::formatLargeNumber(wordNotASchemaValueCount) << " words that were not a schema value."); if (wordNotASchemaValueCount > 0) { QStringList wordsNotInSchemaList = _wordsNotInSchema.toList(); qSort(wordsNotInSchemaList.begin(), wordsNotInSchemaList.end()); LOG_VART(wordsNotInSchemaList); } //technically this could be done outside of this filtering... _writeCustomRules(linesWrittenCount); LOG_INFO( "Wrote " << StringUtils::formatLargeNumber(linesWrittenCount) << " / " << StringUtils::formatLargeNumber(linesParsedCount) << " lines to filtered file."); _filteredCountFile->close(); }