Esempio n. 1
0
/**
 * Attempts a to-English translation for every configured tag key found on the element and updates
 * the visitor's running counters (processed elements, total elements, elements with a successful
 * translation).
 *
 * @param e the element whose tags are examined
 * @throws HootException if no tag keys have been configured for translation
 */
void ToEnglishTranslationVisitor::visit(const boost::shared_ptr<Element>& e)
{
  if (_tagKeys.isEmpty())
  {
    throw HootException("No tag keys specified for language translation.");
  }

  LOG_VART(e);

  //if this var was set while parsing the previous element, increment the counter now
  //NOTE(review): the flag is presumably raised inside _translate; that also means the last visited
  //element is only counted if something outside this method does so - confirm.
  if (_currentElementHasSuccessfulTagTranslation)
  {
    _numElementsWithSuccessfulTagTranslation++;
  }
  _currentElementHasSuccessfulTagTranslation = false;

  const Tags& tags = e->getTags();
  bool elementProcessed = false;
  //constBegin/constEnd are used so that read-only iteration can never detach the key set
  for (QSet<QString>::const_iterator tagKeysItr = _tagKeys.constBegin();
       tagKeysItr != _tagKeys.constEnd(); ++tagKeysItr)
  {
    const QString toTranslateTagKey = *tagKeysItr;
    if (!tags.contains(toTranslateTagKey))
    {
      continue;
    }

    //making skipping tags that already have an english translated tag optional, b/c many of the
    //OSM english translations I've seen are either just copies of the foreign language text or are
    //not very good translations
    const QString preTranslatedTagKey = toTranslateTagKey + ":en";
    if (!_ignorePreTranslatedTags && tags.contains(preTranslatedTagKey))
    {
      //log the value that actually belongs to the pre-translated key, not the source tag's value
      LOG_TRACE(
        "Skipping element with pre-translated tag: " << preTranslatedTagKey << "=" <<
        tags.get(preTranslatedTagKey).trimmed());
    }
    else
    {
      _translate(e, toTranslateTagKey);
      elementProcessed = true;
    }
  }

  //an element counts as processed once, no matter how many of its tags were sent to translation
  if (elementProcessed)
  {
    _numProcessedElements++;
    if (_numProcessedElements % _taskStatusUpdateInterval == 0)
    {
      PROGRESS_INFO("Attempted tag translation for " << _numProcessedElements << " elements.");
    }
  }

  _numTotalElements++;
  if (_numTotalElements % _taskStatusUpdateInterval == 0)
  {
    PROGRESS_INFO("Visited " << _numTotalElements << " elements.");
  }
}
Esempio n. 2
0
void ImplicitTagRawRulesDeriver::_resolveCountTies()
{
  //Any time more than one word/key combo has the same occurrence count, we need to pick just one
  //of them.

  LOG_INFO(
    "Resolving word/tag key/count ties for " <<
    StringUtils::formatLargeNumber(_duplicatedWordTagKeyCountsToValues.size()) <<
    " duplicated word/tag key/counts...");

  _tieResolvedCountFile.reset(
    new QTemporaryFile(
      _tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX"));
  _tieResolvedCountFile->setAutoRemove(!_keepTempFiles);
  if (!_tieResolvedCountFile->open())
  {
    throw HootException(
      QObject::tr("Error opening %1 for writing.").arg(_tieResolvedCountFile->fileName()));
  }
  LOG_DEBUG("Opened tie resolve temp file: " << _tieResolvedCountFile->fileName());
  if (_keepTempFiles)
  {
    LOG_WARN("Keeping temp file: " << _tieResolvedCountFile->fileName());
  }
  if (!_dedupedCountFile->open())
  {
    throw HootException(
      QObject::tr("Error opening %1 for reading.").arg(_dedupedCountFile->fileName()));
  }

  long lineCount = 0;
  long duplicateResolutions = 0;
  while (!_dedupedCountFile->atEnd())
  {
    const QString line = QString::fromUtf8(_dedupedCountFile->readLine().constData()).trimmed();
    LOG_VART(line);
    const QStringList lineParts = line.split("\t");
    LOG_VART(lineParts);
    QString word = lineParts[1].trimmed();
    LOG_VART(word);
    const QString kvp = lineParts[2].trimmed();
    LOG_VART(kvp);
    const QString countStr = lineParts[0].trimmed();
    const long count = countStr.toLong();
    LOG_VART(count);
    const QStringList kvpParts = kvp.split("=");
    const QString tagKey = kvpParts[0];
    LOG_VART(tagKey);
    const QString wordTagKey = word.trimmed() % ";" % tagKey.trimmed();
    LOG_VART(wordTagKey);
    const QString wordTagKeyCount =
      word.trimmed() % ";" % tagKey.trimmed() % ";" % countStr.trimmed();
    LOG_VART(wordTagKeyCount);
    const QString tagValue = kvpParts[1];
    LOG_VART(tagValue);

    if (_duplicatedWordTagKeyCountsToValues.contains(wordTagKeyCount))
    {
      LOG_TRACE("Resolving duplicated word/tag key/count for " << wordTagKeyCount << "...");

      //To resolve the tie, we're going to pick the most specific kvp.  e.g. amenity=public_hall
      //wins out of amenity=hall.  This is not really dealing with same hierarchy level tags
      //(e.g. amenity=school and amenity=hall) and will just arbitrarily pick in that situation.
      //Duplicates do seem to be fairly rare, but there could be some perfomance gains by coming
      //up with a better way to handle this situation.
      QString lineWithMostSpecificKvp = line % "\n";
      const QStringList tagValues = _duplicatedWordTagKeyCountsToValues[wordTagKeyCount];
      for (int i = 0; i < tagValues.size(); i++)
      {
        const QString childKvp = tagKey % "=" % tagValues[i];
        if (OsmSchema::getInstance().isAncestor(childKvp, tagKey % "=" % tagValue))
        {
          lineWithMostSpecificKvp = countStr % "\t" % word % "\t" % childKvp % "\n";
        }
      }
      LOG_VART(lineWithMostSpecificKvp);
      _tieResolvedCountFile->write(lineWithMostSpecificKvp.toUtf8());
      duplicateResolutions++;
    }
    else
    {
      const QString updatedLine = countStr % "\t" % word % "\t" % kvp % "\n";
      LOG_VART(updatedLine);
      _tieResolvedCountFile->write(updatedLine.toUtf8());
    }

    lineCount++;
    if (lineCount % (_statusUpdateInterval * 10) == 0)
    {
      PROGRESS_INFO(
        "Parsed " << StringUtils::formatLargeNumber(lineCount) <<
        " lines from input for duplicated tag key count ties.");
    }
  }
  LOG_VARD(lineCount);
  LOG_INFO(
    "Resolved " << StringUtils::formatLargeNumber(duplicateResolutions) <<
    " word/tag key/count ties.");
  _duplicatedWordTagKeyCountsToValues.clear();
  _tieResolvedCountFile->close();
}
Esempio n. 3
0
void ImplicitTagRawRulesDeriver::_removeDuplicatedKeyTypes()
{
  LOG_INFO("Removing duplicated tag key types from output...");

  //i.e. don't allow amenity=school AND amenity=shop to be associated with the same word...pick one
  //of them

  _dedupedCountFile.reset(
    new QTemporaryFile(
      _tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX"));
  _dedupedCountFile->setAutoRemove(!_keepTempFiles);
  if (!_dedupedCountFile->open())
  {
    throw HootException(
      QObject::tr("Error opening %1 for writing.").arg(_dedupedCountFile->fileName()));
  }
  LOG_DEBUG("Opened dedupe temp file: " << _dedupedCountFile->fileName());
  if (_keepTempFiles)
  {
    LOG_WARN("Keeping temp file: " << _dedupedCountFile->fileName());
  }

  long lineCount = 0;
  long writtenLineCount = 0;
  while (!_sortedCountFile->atEnd())
  {
    const QString line = QString::fromUtf8(_sortedCountFile->readLine().constData()).trimmed();
    LOG_VART(line);
    const QStringList lineParts = line.split("\t");
    LOG_VART(lineParts);
    QString word = lineParts[1].trimmed();
    LOG_VART(word);
    const QString kvp = lineParts[2].trimmed();
    LOG_VART(kvp);
    const QString countStr = lineParts[0].trimmed();
    const long count = countStr.toLong();
    LOG_VART(count);
    const QStringList kvpParts = kvp.split("=");
    const QString tagKey = kvpParts[0];
    LOG_VART(tagKey);
    const QString wordTagKey = word.trimmed() % ";" % tagKey.trimmed();
    LOG_VART(wordTagKey);
    const QString wordTagKeyCount =
      word.trimmed() % ";" % tagKey.trimmed() % ";" % countStr.trimmed();
    LOG_VART(wordTagKeyCount);
    const QString tagValue = kvpParts[1];
    LOG_VART(tagValue);

    //The lines are sorted in reverse by occurrence count.  So the first time we see one word/key
    //combo, we know it had the highest occurrence count, and we can ignore all subsequent
    //instances of it since any one feature can't have more than one tag applied to it with the
    //same key.

    const QString queriedCountAndValue = _wordKeysToCountsValues.value(wordTagKey, "");
    if (queriedCountAndValue.isEmpty())
    {
      _wordKeysToCountsValues[wordTagKey] = countStr % ";" % tagValue;
      //this unescaping must occur during the final temp file write
      if (word.contains("%3D"))
      {
        word = word.replace("%3D", "=");
      }
      else if (word.contains("%3d"))
      {
        word = word.replace("%3d", "=");
      }
      const QString updatedLine = countStr % "\t" % word % "\t" % kvp % "\n";
      LOG_VART(updatedLine);
      _dedupedCountFile->write(updatedLine.toUtf8());
      writtenLineCount++;
    }
    else
    {
      const long queriedCount = queriedCountAndValue.split(";")[0].toLong();
      if (queriedCount == count)
      {
        LOG_TRACE(
          "Recording duplicated word/tag key/count for: " << wordTagKeyCount << " with value: " <<
          tagValue);
        _duplicatedWordTagKeyCountsToValues[wordTagKeyCount].append(tagValue);
      }
    }

    lineCount++;
    if (lineCount % (_statusUpdateInterval * 10) == 0)
    {
      PROGRESS_INFO(
        "Parsed " << StringUtils::formatLargeNumber(lineCount) <<
        " lines from input for duplicated tag key removal.");
    }
  }
  _sortedCountFile->close();
  LOG_INFO(
    "Wrote " << StringUtils::formatLargeNumber(writtenLineCount) << " lines to deduped file.");
  _dedupedCountFile->close();
}
Esempio n. 4
0
void ImplicitTagRawRulesDeriver::deriveRawRules(const QStringList inputs,
                                                const QStringList translationScripts,
                                                const QString output)
{
  _validateInputs(inputs, translationScripts, output);

  LOG_INFO(
    "Generating implicit tag rules raw file for inputs: " << inputs <<
    ", translation scripts: " << translationScripts << ".  Writing to output: " << output << "...");
  LOG_VARD(_sortParallelCount);
  LOG_VARD(_skipFiltering);
  LOG_VARD(_translateNamesToEnglish);

  _init();

  long eligibleFeatureCount = 0;
  long totalFeatureCount = 0;
  for (int i = 0; i < inputs.size(); i++)
  {
    boost::shared_ptr<ElementInputStream> inputStream =
      _getInputStream(inputs.at(i), translationScripts.at(i));
    while (inputStream->hasMoreElements())
    {
      ElementPtr element = inputStream->readNextElement();
      LOG_VART(element);

      totalFeatureCount++;

      assert(_elementCriterion.get());
      if (_skipFiltering || _elementCriterion->isSatisfied(element))
      {
        QStringList names = element->getTags().getNames();
        assert(!names.isEmpty());

        //old_name/former_name generally indicates that an element formerly went by the name, so
        //not really useful here.
        if (names.removeAll("old_name") > 0)
        {
          LOG_VART("Removed old name tag.");
        }
        if (names.removeAll("former_name") > 0)
        {
          LOG_VART("Removed former name tag.");
        }
        assert(!names.isEmpty());

        if (_translateNamesToEnglish)
        {
          names = ImplicitTagUtils::translateNamesToEnglish(names, element->getTags(), _translator);
        }
        LOG_VART(names);

        //get back only the tags that we'd be interested in applying to future elements implicitly
        //based on name
        const QStringList kvps = _elementCriterion->getEligibleKvps(element->getTags());
        assert(!kvps.isEmpty());
        if (kvps.isEmpty())
        {
          throw HootException("Kvps empty.");
        }

        //parse whole names and token groups
        _parseNames(names, kvps);

        eligibleFeatureCount++;

        if (eligibleFeatureCount % _statusUpdateInterval == 0)
        {
          PROGRESS_INFO(
            "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) <<
            " eligible features / " << StringUtils::formatLargeNumber(totalFeatureCount) <<
            " total features.");
        }
      }
    }
    _inputReader->finalizePartial();
  }
  _countFile->close();

  LOG_INFO(
    "Parsed " << StringUtils::formatLargeNumber(eligibleFeatureCount) <<
    " eligible features from " << StringUtils::formatLargeNumber(totalFeatureCount) <<
    " total features.");
  LOG_INFO(
    "Wrote " << StringUtils::formatLargeNumber(_countFileLineCtr) << " lines to count file.");

  _sortByTagOccurrence();   //sort in descending count order
  _removeDuplicatedKeyTypes();
  bool tieCountsNeededResolved = false;
  if (_duplicatedWordTagKeyCountsToValues.size() > 0)
  {
    _resolveCountTies();
    tieCountsNeededResolved = true;
  }
  LOG_INFO(
    "Extracted "  << StringUtils::formatLargeNumber(_wordKeysToCountsValues.size()) <<
    " word/tag associations.");
  LOG_INFO("Clearing word/tag associations...");
  _wordKeysToCountsValues.clear();
  if (tieCountsNeededResolved)
  {
    _sortByWord(_tieResolvedCountFile);
  }
  else
  {
    _sortByWord(_dedupedCountFile);
  }
}
Esempio n. 5
0
double AttributeComparator::compareMaps()
{
  _updateBounds();
  double scoreSum = 0.0;

  double buffer = 10.0;

  double oldIsACost = OsmSchema::getInstance().getIsACost();
  OsmSchema::getInstance().setIsACost(0.5);

  vector<double> scores;
  // sampled standard deviation
  _s = -1;
  // 1.645 for 90% confidence, 1.96 for 95% confidence, and 2.58 for 99% confidence.
  double zalpha = 1.645;
  _ci = -1;

  boost::shared_ptr<OsmMap> referenceMap, otherMap;

  // do this a bunch of times
  for (int i = 0; i < _iterations * 4 && (int)scores.size() < _iterations; i++)
  {
    // generate a random source point
    _r.x = Random::instance()->generateUniform() * (_projectedBounds.MaxX - _projectedBounds.MinX) +
          _projectedBounds.MinX;
    _r.y = Random::instance()->generateUniform() * (_projectedBounds.MaxY - _projectedBounds.MinY) +
          _projectedBounds.MinY;

    // pick one map as the reference map
    if (Random::instance()->coinToss())
    {
      referenceMap = _mapP1;
      otherMap = _mapP2;
    }
    else
    {
      referenceMap = _mapP2;
      otherMap = _mapP1;
    }

    // find the nearest way on the reference map
    vector<long> wids1 = referenceMap->getIndex().findWayNeighbors(_r, buffer);
    vector<long> wids2 = otherMap->getIndex().findWayNeighbors(_r, buffer);

    Tags t1, t2;
    double bestScore = -1.0;
    for (size_t j = 0; j < wids1.size(); j++)
    {
      WayPtr w1 = referenceMap->getWay(wids1[j]);

      for (size_t k = 0; k < wids2.size(); k++)
      {
        WayPtr w2 = otherMap->getWay(wids2[k]);
        double score = TagComparator::getInstance().compareTags(w1->getTags(), w2->getTags());
        if (score > bestScore)
        {
          bestScore = score;
          t1 = w1->getTags();
          t2 = w2->getTags();
        }
      }
    }

    if (bestScore >= 0.0)
    {
//        LOG_INFO("====");
//        LOG_INFO("score: " << bestScore);
//        LOG_INFO("t1: \n" << t1);
//        LOG_INFO("t2: \n" << t2);

      scoreSum += bestScore;
      scores.push_back(bestScore);
      sort(scores.begin(), scores.end());
      _median = scores[scores.size() / 2];
      _mean = scoreSum / (double)scores.size();
    }

    if (scores.size() > 1)
    {
      double v = 0;
      for (size_t i = 0; i < scores.size(); i++)
      {
        v += (scores[i] - _mean) * (scores[i] - _mean);
      }
      _s = sqrt(v / (scores.size() - 1));

      _ci = zalpha * _s / sqrt(scores.size());
    }

    PROGRESS_INFO(i << " / " << _iterations << " mean: " << _mean << "   ");
  }

  LOG_INFO(_iterations << " / " << _iterations << " mean: " << _mean << "   ");

  OsmSchema::getInstance().setIsACost(oldIsACost);

  return _mean;
}
Esempio n. 6
0
void ImplicitTagRulesDatabaseDeriver::_applyFiltering(const QString input)
{
  LOG_INFO("Applying word/tag/rule filtering to output...");

  _filteredCountFile.reset(
    new QTemporaryFile(
      ConfigOptions().getApidbBulkInserterTempFileDir() +
      "/implicit-tag-rules-deriver-temp-XXXXXX"));
  _filteredCountFile->setAutoRemove(!ConfigOptions().getImplicitTaggingKeepTempFiles());
  if (!_filteredCountFile->open())
  {
    throw HootException(
      QObject::tr("Error opening %1 for writing.").arg(_filteredCountFile->fileName()));
  }
  LOG_DEBUG("Opened filtered temp file: " << _filteredCountFile->fileName());
  if (ConfigOptions().getImplicitTaggingKeepTempFiles())
  {
    LOG_WARN("Keeping temp file: " << _filteredCountFile->fileName());
  }
  QFile inputFile(input);
  if (!inputFile.open(QIODevice::ReadOnly))
  {
    throw HootException(QObject::tr("Error opening %1 for reading.").arg(input));
  }
  LOG_DEBUG("Opened input file: " << input);

  long linesParsedCount = 0;
  long linesWrittenCount = 0;
  long wordsTooSmallCount = 0;
  long ignoredWordsCount = 0;
  long ignoredTagsCount = 0;
  long ignoredRuleCountDueToCustomRules = 0;
  long wordNotASchemaValueCount = 0;

  while (!inputFile.atEnd())
  {
    const QString line = QString::fromUtf8(inputFile.readLine().constData()).trimmed();
    LOG_VART(line);
    const QStringList lineParts = line.split("\t");
    LOG_VART(lineParts);
    QString word = lineParts[1].trimmed();
    LOG_VART(word);

    //this won't come back true unless _useSchemaTagValuesForWordsOnly = true.
    const bool wordNotASchemaTagValue = _wordIsNotASchemaTagValue(word);

    const bool wordTooSmall = word.length() < _minWordLength;

    //Skip the word if we already have a custom rule that is associated with it (they're applied
    //to the database after this filtering).
    if (!wordTooSmall && !_customRules.getWordIgnoreList().contains(word, Qt::CaseInsensitive) &&
        !wordNotASchemaTagValue)
    {
      const QString kvp = lineParts[2].trimmed();
      LOG_VART(kvp);
      const QString tagKey = kvp.split("=")[0];
      LOG_VART(tagKey);
      const QString keyWildCard = tagKey % "=*";
      LOG_VART(keyWildCard);

      const QStringList tagIgnoreList = _customRules.getTagIgnoreList();
      const bool ignoreTag =
        !tagIgnoreList.isEmpty() &&
        (tagIgnoreList.contains(kvp) || tagIgnoreList.contains(keyWildCard));
      LOG_VART(ignoreTag);

      if (!ignoreTag)
      {
        const QString customRuleTag = _customRules.getCustomRulesList().value(word.toLower(), "");
        if (customRuleTag == kvp)
        {
          LOG_TRACE(
            "Skipping word/tag combo on custom rule list.  Word: " << word << ", tag: " <<
            kvp << ".");
          ignoredRuleCountDueToCustomRules++;
        }
        else
        {
          //write the valid count line
          const long count = lineParts[0].trimmed().toLong();
          LOG_VART(count);
          const QString line = QString::number(count) % "\t" % word % "\t" % kvp % "\n";
          LOG_VART(line);
          _filteredCountFile->write(line.toUtf8());
          linesWrittenCount++;
        }
      }
      else
      {
        if (ignoreTag)
        {
          LOG_TRACE("Skipping tag on the ignore list: " << kvp << ".");
        }
        else
        {
          LOG_TRACE("Skipping tag not on the include list: " << kvp << ".");
        }
        ignoredTagsCount++;
      }
    }
    else
    {
      if (wordTooSmall)
      {
        LOG_TRACE(
          "Skipping word: " << word <<
          ", the length of which is less than the minimum allowed word length of: " <<
          _minWordLength);
        wordsTooSmallCount++;
      }
      else if (wordNotASchemaTagValue)
      {
        LOG_TRACE(
          "Schema tag value requirement for word is being enforced and word is not a schema " <<
          "tag value: " << word.toLower() << ".");
        _wordsNotInSchema.insert(word.toLower());
        wordNotASchemaValueCount++;
      }
      else
      {
        LOG_TRACE("Skipping word on the ignore list: " << word << ".");
        ignoredWordsCount++;
      }
    }

    linesParsedCount++;
    if (linesParsedCount % (_statusUpdateInterval * 100) == 0)
    {
      PROGRESS_INFO(
        "Filtered " << StringUtils::formatLargeNumber(linesParsedCount) <<
        " count file lines from input.");
    }
  }
  inputFile.close();

  LOG_INFO("Parsed " << StringUtils::formatLargeNumber(linesParsedCount) << " words.");
  LOG_INFO(
    "Skipped " << StringUtils::formatLargeNumber(wordsTooSmallCount) <<
    " words that were too small.");
  LOG_INFO("Ignored " << StringUtils::formatLargeNumber(ignoredWordsCount) << " words.");
  LOG_INFO("Ignored " << StringUtils::formatLargeNumber(ignoredTagsCount) << " tags.");
  LOG_INFO(
    "Ignored " << StringUtils::formatLargeNumber(ignoredRuleCountDueToCustomRules) <<
    " rules due to them overlapping with the custom rules list.");
  LOG_INFO(
    "Skipped " << StringUtils::formatLargeNumber(wordNotASchemaValueCount) <<
    " words that were not a schema value.");
  if (wordNotASchemaValueCount > 0)
  {
    QStringList wordsNotInSchemaList = _wordsNotInSchema.toList();
    qSort(wordsNotInSchemaList.begin(), wordsNotInSchemaList.end());
    LOG_VART(wordsNotInSchemaList);
  }

  //technically this could be done outside of this filtering...
  _writeCustomRules(linesWrittenCount);

  LOG_INFO(
    "Wrote " << StringUtils::formatLargeNumber(linesWrittenCount) << " / " <<
     StringUtils::formatLargeNumber(linesParsedCount) << " lines to filtered file.");

  _filteredCountFile->close();
}