void MapStatsMapper::_writeStats(HadoopPipes::MapContext& context, const MapStats& stats)
{
  LOG_INFO("node count: " << _nodeCount);
  LOG_INFO("way count: " << _wayCount);

  if (stats.isValid())
  {
    pp::Hdfs fs;
    int partition = context.getJobConf()->getInt("mapred.task.partition");
    string workDir = context.getJobConf()->get("mapred.work.output.dir");
    LOG_INFO("Stats: " << stats.toString());
    QString path = QString("%1/part-%2.stats").
        arg(QString::fromStdString(workDir)).
        arg(partition, 5, 10, QChar('0'));
    LOG_INFO("Writing to: " << path);
    boost::shared_ptr<ostream> osStats(fs.create(path.toStdString()));
    stats.write(*osStats);
  }
  else
  {
    LOG_INFO("Stats are not valid.");
  }
}
void map(HadoopPipes::MapContext& context)
{
  std::string line = context.getInputValue();
  std::string year = line.substr(15, 4);
  std::string airTemperature = line.substr(87, 5);
  std::string q = line.substr(92, 1);
  if (airTemperature != "+9999" &&
      (q == "0" || q == "1" || q == "4" || q == "5" || q == "9"))
  {
    context.emit(year, airTemperature);
  }
}
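// The (year, airTemperature) pairs emitted above are typically folded down by a
// reducer that keeps the maximum reading per year. The following is a minimal
// sketch of such a reducer, assuming the standard HadoopPipes::Reducer interface
// and the HadoopUtils string helpers; the class name and include paths are
// illustrative and not taken from the original code (header locations vary by
// Hadoop version).
#include <climits>
#include <string>
#include "hadoop/Pipes.hh"
#include "hadoop/StringUtils.hh"

class MaxTemperatureReducer : public HadoopPipes::Reducer
{
public:
  MaxTemperatureReducer(HadoopPipes::TaskContext& context) {}

  void reduce(HadoopPipes::ReduceContext& context)
  {
    int maxValue = INT_MIN;
    // Iterate over every temperature emitted for this key (the year).
    while (context.nextValue())
    {
      int value = HadoopUtils::toInt(context.getInputValue());
      if (value > maxValue)
      {
        maxValue = value;
      }
    }
    context.emit(context.getInputKey(), HadoopUtils::toString(maxValue));
  }
};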
/*
 * 1. exec segment usage calc
 * 2. emit (segno, seg_usage_text)
 */
void map(HadoopPipes::MapContext& context)
{
    int ret = 0;
    printf("DBG:-- enter func:%s\n", __func__);

    const char *segfile = context.getInputKey().data();
    printf("DBG:-- key len :%zu ,segfile:%s\n", context.getInputValue().size(), segfile);
    uint64_t segno = get_segfile_no(segfile);
    HADOOP_ASSERT(segfile != NULL, "failed to read segfile");

    GHashTable *ss_hashtable = g_hash_table_new_full(g_str_hash, g_str_equal, NULL, NULL);
    ret = load_all_snapshot(m_storage, SNAPSHOT_FILE, ss_hashtable);
    printf("DBG:-- snapshot loaded\n");
    g_assert(ret == 0);

    GList *ss_list = NULL;
    ret = sort_all_snapshot(ss_hashtable, &ss_list);
    printf("DBG:-- snapshot sorted\n");
    g_assert(ss_list != NULL);
    g_assert(ret == 0);

    //struct inode * latest_inode = load_latest_inode(storage);
    struct inode *inode = NULL;
    char *up_sname;
    ret = get_refer_inode_between_snapshots(m_storage, segno, ss_list, &inode, &up_sname);

    SEG_USAGE_T seg_usage;
    memset(&seg_usage, 0, sizeof(SEG_USAGE_T));
    if (ret == 0) {
        printf("DBG:-- seg is in snapshots\n");
        strncpy(seg_usage.up_sname, up_sname, strlen(up_sname));
        ret = seg_usage_calc(m_storage, m_block_size, segno, inode, &seg_usage);
        printf("up sname is:%s\n", seg_usage.up_sname);
        g_assert(ret == 0);
    }
    if (ret == 1) {
        printf("DBG:-- seg is on snapshot, do nothing\n");
    }
    if (ret == 2) {
        printf("DBG:-- seg is above snapshot, maybe need migrate\n");
        strncpy(seg_usage.up_sname, EMPTY_UP_SNAPSHOT, strlen(EMPTY_UP_SNAPSHOT));
        printf("DBG:-- up sname is:%s\n", seg_usage.up_sname);
        inode = load_latest_inode(m_storage);
        ret = seg_usage_calc(m_storage, m_block_size, segno, inode, &seg_usage);
        g_assert(ret == 0);
    }

#if 1
    string key = string(segfile, strlen(segfile));
    char segtextbuf[4096];
    uint32_t len = seg_usage2text(&seg_usage, segtextbuf);
    printf("DBG:-- segtextbuf :%s ..\n", segtextbuf);
    string value = string(segtextbuf, len);
    printf("DBG:-- send segment usage text to reducer ..\n");
    context.emit(key, value);
#endif
    g_free(seg_usage.bitmap);
}
// map function: receives a line, outputs (word, "1")
// to reducer.
void map(HadoopPipes::MapContext& context)
{
  //--- get line of text ---
  string line = context.getInputValue();

  //--- split it into words ---
  vector<string> words = HadoopUtils::splitString(line, " ");

  //--- emit each word tuple (word, "1") ---
  for (unsigned int i = 0; i < words.size(); i++)
  {
    context.emit(words[i], HadoopUtils::toString(1));
  }
}
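// Each of these map() functions normally lives inside a HadoopPipes::Mapper
// subclass that the Pipes runtime instantiates, together with a reducer and a
// main() entry point. Below is a minimal sketch of that boilerplate for the
// word-count mapper above, assuming the standard HadoopPipes/HadoopUtils APIs;
// the class names are illustrative, not from the original code, and the include
// paths may differ by Hadoop version.
#include <string>
#include <vector>
#include "hadoop/Pipes.hh"
#include "hadoop/TemplateFactory.hh"
#include "hadoop/StringUtils.hh"

class WordCountMapper : public HadoopPipes::Mapper
{
public:
  WordCountMapper(HadoopPipes::TaskContext& context) {}

  void map(HadoopPipes::MapContext& context)
  {
    // Tokenize the input line and emit (word, "1") for each token,
    // as in the map function above.
    std::vector<std::string> words =
        HadoopUtils::splitString(context.getInputValue(), " ");
    for (unsigned int i = 0; i < words.size(); i++)
    {
      context.emit(words[i], HadoopUtils::toString(1));
    }
  }
};

class WordCountReducer : public HadoopPipes::Reducer
{
public:
  WordCountReducer(HadoopPipes::TaskContext& context) {}

  void reduce(HadoopPipes::ReduceContext& context)
  {
    // Sum the "1" counts emitted for this word.
    int count = 0;
    while (context.nextValue())
    {
      count += HadoopUtils::toInt(context.getInputValue());
    }
    context.emit(context.getInputKey(), HadoopUtils::toString(count));
  }
};

int main(int argc, char* argv[])
{
  // Standard Pipes entry point: the TemplateFactory tells the runtime which
  // mapper/reducer classes to construct for each task.
  return HadoopPipes::runTask(
      HadoopPipes::TemplateFactory<WordCountMapper, WordCountReducer>());
}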
//map function: receives a line, outputs (byteOffset, upper(line))
//byte offset is monotonically rising, so sorting will be achieved
void map(HadoopPipes::MapContext& context)
{
  //get line of text
  string line = context.getInputValue();

  //transform to uppercase
  string::iterator it = line.begin();
  while (it != line.end()) {
    if ('a' <= *it && *it <= 'z') {
      *it += 'A' - 'a';
    }
    ++it;
  }

  //emit the input key (the byte offset) with the uppercased line
  context.emit(context.getInputKey(), line);
}
void map(HadoopPipes::MapContext& context)
{
  // line, year, airTemperature, and valStr are presumably members of the
  // enclosing Mapper class; they are not declared in this snippet.
  line = context.getInputValue();

  // skip the header line ("STN---..."); note find(), not find_first_of(),
  // so only lines actually containing the header marker are dropped
  size_t found = line.find("STN---");
  if (found != std::string::npos)
    return;

  year = DataSet::getYear(line);
  airTemperature = DataSet::getMax(line);
  if (airTemperature != DataSet::MISSING) {
    valStr.str("");
    valStr << airTemperature;
    context.emit(year, valStr.str());
  }
}
void WayJoin1Mapper::_map(shared_ptr<OsmMap>& m, HadoopPipes::MapContext& context)
{
  LOG_INFO("Starting map");

  string keyStr;
  string valueStr;
  keyStr.resize(sizeof(int64_t));
  int64_t* key = (int64_t*)keyStr.data();

  // Remove all non-roads.
  m->removeWays(TagFilter(Filter::FilterMatches, "highway", ""));

  Debug::printTroubled(m);

  // emit the node's ID as the key and x/y as the value.
  valueStr.resize(sizeof(ValueNode));
  ValueNode* valueNode = (ValueNode*)valueStr.data();
  const OsmMap::NodeMap& nm = m->getNodeMap();
  for (OsmMap::NodeMap::const_iterator it = nm.constBegin(); it != nm.constEnd(); ++it)
  {
    const shared_ptr<const Node>& n = it.value();
    *key = n->getId();
    valueNode->x = n->getX();
    valueNode->y = n->getY();
    context.emit(keyStr, valueStr);
  }

  // emit the way's nodes as the key and the way's id as the value.
  valueStr.resize(sizeof(ValueWay));
  ValueWay* valueWay = (ValueWay*)valueStr.data();
  const WayMap& wm = m->getWays();
  for (WayMap::const_iterator it = wm.begin(); it != wm.end(); ++it)
  {
    const shared_ptr<const Way>& w = it->second;
    valueWay->id = w->getId();
    const std::vector<long>& nids = w->getNodeIds();
    for (size_t i = 0; i < nids.size(); i++)
    {
      *key = nids[i];
      context.emit(keyStr, valueStr);
    }
  }
}
SegUsageCalcReader(HadoopPipes::MapContext& context)
{
    std::string _filename;
    /* FIXIT: hard-coded parsing to get the seg file name from the Hadoop InputSplit protocol? */
    int16_t mysize = *(int16_t*)context.getInputSplit().data();
    _filename = context.getInputSplit().data() + 2;
    printf("GDB:-- filename :%s sizeof:%zu\n", _filename.c_str(), _filename.size());

    uint64_t _offset = *(int64_t*)(context.getInputSplit().data() + 2 + _filename.size());
    uint64_t offset = Swap64(_offset);
    uint64_t _len = *(int64_t*)(context.getInputSplit().data() + 2 + _filename.size() + 8);
    uint64_t len = Swap64(_len);
    printf("GDB:-- seg offset:%llu len:%llu\n",
           (unsigned long long)offset, (unsigned long long)len);

    std::string filename = _filename.data() + 5;
    printf("GDB:-- filename :%s sizeof:%zu\n", filename.c_str(), filename.size());
    if (TRUE != g_str_has_suffix(filename.c_str(), "seg")) {
        printf("GDB:-- ignore it \n");
        m_bytes_total = m_bytes_read = 0;
        return;
    }

    m_seg_file = g_strdup(g_basename(filename.c_str()));
    printf("GDB:-- seg file:%s\n", m_seg_file);
    m_bytes_total = len;
    m_bytes_read = 0;
}
void WayJoin2Mapper::map(HadoopPipes::MapContext& context)
{
  _context = &context;
  if (_reader == NULL)
  {
    HadoopPipes::RecordReader* rr = pp::HadoopPipesUtils::getRecordReader(&context);
    _reader = dynamic_cast<WayJoin2RecordReader*>(rr);
    if (_reader == NULL)
    {
      throw InternalErrorException("Expected a record reader of type WayJoin2RecordReader");
    }
  }

  if (_reader->getRecordType() == WayJoin2InputSplit::PbfInputSplitType)
  {
    mapOsmMap(_reader->getMap());
  }
  else
  {
    const string& key = context.getInputKey();
    const string& value = context.getInputValue();

    if (key.size() != sizeof(int64_t))
    {
      throw InternalErrorException("Expected key to be an int64_t");
    }
    if (value.size() != sizeof(WayJoin1Reducer::Value))
    {
      throw InternalErrorException("Expected value to be a WayJoin1Reducer::Value");
    }

    int64_t* k = (int64_t*)key.data();
    WayJoin1Reducer::Value* v = (WayJoin1Reducer::Value*)value.data();

    mapWayPoints(*k, *v);
  }
}
void map(HadoopPipes::MapContext& context)
{
    std::vector<nise::HashEntry> v;
    {
        std::stringstream ss(context.getInputValue());
        nise::ReadVector<nise::HashEntry>(ss, &v);
        if (!ss) return;
    }
    fbi::Hamming hamming;
    if (v.size() > nise::MAX_HASH) {
        return;
    }
    // compare every pair of entries; for pairs whose sketches fall within the
    // offline distance threshold, emit the link in both directions
    for (unsigned i = 0; i < v.size(); ++i) {
        for (unsigned j = 0; j < i; ++j) {
            if (v[i].second == v[j].second) continue;
            if (hamming(v[i].first.sketch, v[j].first.sketch) < nise::SKETCH_DIST_OFFLINE) {
                std::string v1(nise::EncodeUint32(v[i].second));
                std::string v2(nise::EncodeUint32(v[j].second));
                context.emit(v1, v2);
                context.emit(v2, v1);
            }
        }
    }
}
void map(HadoopPipes::MapContext& context)
{
  string line = context.getInputValue();
  string year = line.substr(0, 4);
  string airTemperature = line.substr(5, 7);
  context.emit(year, airTemperature);
}