int main( int argc, char *argv[] ) { Alignments alignments ; Genome genome ; std::vector<int> rascafFileId ; char line[2048] ; char prefix[512] = "rascaf_scaffold" ; int rawAssemblyInd = 1 ; FILE *rascafFile ; bool contigLevel = false ; int i ; FILE *outputFile ; FILE *infoFile ; breakN = 1 ; if ( argc < 2 ) { fprintf( stderr, "%s", usage ) ; exit( 1 ) ; } for ( i = 1 ; i < argc ; ++i ) { if ( !strcmp( "-o", argv[i] ) ) { strcpy( prefix, argv[i + 1 ] ) ; ++i ; } else if ( !strcmp( "-ms", argv[i] ) ) { minSupport = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-ignoreGap", argv[i] ) ) { ignoreGap = true ; } else if ( !strcmp( "-r", argv[i] ) ) { rascafFileId.push_back( i + 1 ) ; ++i ; } else { fprintf( stderr, "Unknown option: %s\n", argv[i] ) ; exit( EXIT_FAILURE ) ; } } if ( rascafFileId.size() == 0 ) { fprintf( stderr, "Must use -r to specify rascaf output file.\n" ) ; exit( EXIT_FAILURE ) ; } MAX_NEIGHBOR = 1 + rascafFileId.size() ; // Get the bam file. rascafFile = fopen( argv[ rascafFileId[0] ], "r" ) ; while ( fgets( line, sizeof( line ), rascafFile ) != NULL ) { if ( strstr( line, "command line:" ) ) { char *p ; char buffer[512] ; p = strstr( line, "-breakN" ) ; if ( p != NULL ) { p += 7 ; while ( *p == ' ' ) ++p ; for ( i = 0 ; *p && *p != ' ' ; ++p, ++i ) buffer[i] = *p ; buffer[i] = '\0' ; breakN = atoi( buffer ) ; } p = strstr( line, "-b" ) ; if ( p == NULL ) { fprintf( stderr, "Could not find the bam file specified by -b in Rascaf.\n" ) ; exit( 1 ) ; } p += 2 ; while ( *p == ' ' ) ++p ; for ( i = 0 ; *p && *p != ' ' ; ++p, ++i ) buffer[i] = *p ; buffer[i] = '\0' ; alignments.Open( buffer ) ; p = strstr( line, "-f") ; if ( p == NULL ) { fprintf( stderr, "Could not find the raw assembly file specified by -f in Rascaf.\n" ) ; exit( 1 ) ; } p += 2 ; while ( *p == ' ' ) ++p ; for ( i = 0 ; *p && *p != ' ' && *p != '\n' ; ++p, ++i ) buffer[i] = *p ; buffer[i] = '\0' ; fprintf( stderr, "Found raw assembly file: %s\n", buffer ) ; genome.Open( alignments, buffer ) ; break ; } } fclose( rascafFile ) ; // Parse the input. for ( unsigned int fid = 0 ; fid < rascafFileId.size() ; ++fid ) { rascafFile = fopen( argv[ rascafFileId[fid] ], "r" ) ; bool start = false ; int tag ; while ( fgets( line, sizeof( line ), rascafFile ) != NULL ) { if ( strstr( line, "command line:" ) ) { start = true ; if ( strstr( line, "-f" ) ) { contigLevel = true ; } continue ; } if ( !start ) continue ; if ( !strcmp( line, "WARNINGS:\n" ) ) break ; std::vector<struct _part> nparts ; if ( line[0] >= '0' && line[0] <= '9' ) { AddConnection( line, alignments, nparts ) ; connects.push_back( nparts ) ; tag = 0 ; } else if ( line[0] == '\t' || line[0] == ' ' ) { // Break the nparts if the support is too low. int num = 0 ; for ( i = 0 ; line[i] < '0' || line[i] > '9' ; ++i ) ; for ( ; line[i] >= '0' && line[i] <= '9' ; ++i ) num = num * 10 + line[i] - '0' ; ++tag ; if ( num < minSupport ) { nparts = connects.back() ; connects.pop_back() ; int size = nparts.size() ; std::vector<struct _part> newNParts ; for ( i = 0 ; i < tag ; ++i ) newNParts.push_back( nparts[i] ) ; if ( newNParts.size() > 1 ) connects.push_back( newNParts ) ; newNParts.clear() ; for ( ; i < size ; ++i ) newNParts.push_back( nparts[i] ) ; if ( newNParts.size() > 1 ) connects.push_back( newNParts ) ; tag = 0 ; } } } fclose( rascafFile ) ; } if ( contigLevel == false ) { genome.SetIsOpen( contigLevel ) ; } // Build the graph int contigCnt = genome.GetContigCount() ; int edgeCnt = 0 ; int csize = connects.size() ; for ( i = 0 ; i < csize ; ++i ) edgeCnt += connects[i].size() ; ContigGraph contigGraph( contigCnt, contigCnt + edgeCnt ) ; for ( i = 0 ; i < contigCnt - 1 ; ++i ) { if ( genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( i + 1 ) ) { contigGraph.AddEdge( i, 1, i + 1, 0 ) ; } } for ( i = 0 ; i < csize ; ++i ) { std::vector<struct _part> &parts = connects[i] ; int size = parts.size() ; for ( int j = 0 ; j < size - 1 ; ++j ) { struct _part &a = parts[j] ; struct _part &b = parts[j + 1] ; // Two dummy nodes for each contig. Left is 0, right is 1 int dummyU = 0 ; int dummyV = 0 ; if ( a.strand == '+' ) dummyU = 1 ; if ( b.strand == '-' ) dummyV = 1 ; contigGraph.AddEdge( a.contigId, dummyU, b.contigId, dummyV, true ) ; } } // Check the cycles in the contig graph. This may introduces when combining different rascaf outputs. int *visitTime = new int[contigCnt] ; struct _pair *neighbors = new struct _pair[ MAX_NEIGHBOR ] ; bool *isInCycle = new bool[contigCnt] ; std::vector<int> cycleNodes ; memset( visitTime, -1, sizeof( int ) * contigCnt ) ; memset( isInCycle, false, sizeof( bool ) * contigCnt ) ; for ( i = 0 ; i < contigCnt ; ++i ) { if ( isInCycle[i] ) continue ; if ( contigGraph.IsInCycle( i, cycleNodes, visitTime ) ) { int cnt = cycleNodes.size() ; //printf( "===\n") ; for ( int j = 0 ; j < cnt ; ++j ) { //printf( "In cycle %d\n", cycleNodes[j] ) ; isInCycle[ cycleNodes[j] ] = true ; } } } //exit( 1 ) ; // Remove the connected edges involving the nodes in the cycle for ( i = 0 ; i < contigCnt ; ++i ) { if ( isInCycle[i] ) { for ( int dummy = 0 ; dummy <= 1 ; ++dummy ) { int ncnt = contigGraph.GetNeighbors( i, dummy, neighbors, MAX_NEIGHBOR ) ; for ( int j = 0 ; j < ncnt ; ++j ) { if ( neighbors[j].a == i + 2 * dummy - 1 && neighbors[j].b != dummy && genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( neighbors[j].a ) ) continue ; // the connection created by the raw assembly else contigGraph.RemoveEdge( i, dummy, neighbors[j].a, neighbors[j].b ) ; } } } } //delete[] isInCycle ; //printf( "hi: %d %d\n", __LINE__, contigCnt ) ; //printf( "%d %d\n", contigGraph.GetNeighbors( 0, 0, neighbors, MAX_NEIGHBOR ), contigGraph.GetNeighbors( 0, 1, neighbors, MAX_NEIGHBOR ) ) ; // Sort the scaffolds from fasta file, so that longer scaffold come first int scafCnt = genome.GetChrCount() ; struct _pair *scafInfo = new struct _pair[scafCnt] ; memset( scafInfo, -1, sizeof( struct _pair) * scafCnt ) ; for ( i = 0 ; i < contigCnt ; ++i ) { int chrId = genome.GetChrIdFromContigId( i ) ; if ( scafInfo[chrId].a == -1 ) { scafInfo[ chrId ].a = i ; scafInfo[ chrId ].b = genome.GetChrLength( chrId ) ; } } qsort( scafInfo, scafCnt, sizeof( struct _pair ), CompScaffold ) ; // Merge the branches and build the scaffold ContigGraph scaffold( contigCnt, 2 * contigCnt ) ; // Use a method similar to topological sort bool *used = new bool[contigCnt] ; int *degree = new int[2 *contigCnt] ; int *danglingVisitTime = new int[contigCnt] ; int *counter = new int[contigCnt] ; int *visitDummy = new int[ contigCnt ] ; int *buffer = new int[contigCnt] ; int *buffer2 = new int[contigCnt] ; bool *isInQueue = new bool[ contigCnt ] ; int *chosen = new int[contigCnt] ; int chosenCnt ; memset( isInCycle, false, sizeof( bool ) * contigCnt ) ; memset( visitTime, -1, sizeof( int ) * contigCnt ) ; memset( visitDummy, -1, sizeof( int ) * contigCnt ) ; memset( counter, -1, sizeof( int ) * contigCnt ) ; // Use those memory to remove triangular cycles for ( i = 0 ; i < scafCnt ; ++i ) { int from, to ; if ( scafInfo[i].a == -1 ) continue ; genome.GetChrContigRange( genome.GetChrIdFromContigId( scafInfo[i].a ), from, to ) ; ForwardSearch( from, 0, i, visitTime, counter, visitDummy, contigGraph ) ; chosenCnt = 0 ; BackwardSearchForTriangularCycle( to, 1, i, visitTime, counter, visitDummy, contigGraph, chosen, chosenCnt ) ; for ( int j = 0 ; j < chosenCnt ; ++j ) { //printf( "%d\n", chosen[j] ) ; isInCycle[ chosen[j] ] = true ; } } for ( i = 0 ; i < contigCnt ; ++i ) { if ( isInCycle[i] ) { for ( int dummy = 0 ; dummy <= 1 ; ++dummy ) { int ncnt = contigGraph.GetNeighbors( i, dummy, neighbors, MAX_NEIGHBOR ) ; for ( int j = 0 ; j < ncnt ; ++j ) { if ( neighbors[j].a == i + 2 * dummy - 1 && neighbors[j].b != dummy && genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( neighbors[j].a ) ) continue ; // the connection created by the raw assembly else contigGraph.RemoveEdge( i, dummy, neighbors[j].a, neighbors[j].b ) ; } } } } memset( used, false, sizeof( bool ) * contigCnt ) ; memset( visitTime, -1, sizeof( int ) * contigCnt ) ; memset( visitDummy, -1, sizeof( int ) * contigCnt ) ; memset( danglingVisitTime, -1, sizeof( int ) * contigCnt ) ; memset( counter, -1, sizeof( int ) * contigCnt ) ; memset( isInQueue, false, sizeof( bool ) * contigCnt ) ; ContigGraph newGraph( contigCnt, edgeCnt ) ; // Compute the gap size int *gapSize = new int[contigCnt] ; for ( i = 0 ; i < contigCnt - 1 ; ++i ) { if ( genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( i + 1 ) ) { struct _contig c1 = genome.GetContigInfo( i ) ; struct _contig c2 = genome.GetContigInfo( i + 1 ) ; gapSize[i] = c2.start - c1.end - 1 ; } else gapSize[i] = -1 ; } // Start search int ncnt ; struct _pair *queue = new struct _pair[ contigCnt ] ; int head = 0, tail ; int danglingTime = 0 ; // Pre-allocate the subgraph. ContigGraph subgraph( contigCnt, 3 * contigCnt ) ; for ( i = 0 ; i < scafCnt ; ++i ) { //if ( used[144281] == true ) // printf( "changed %d %d\n", i, scafInfo[i - 1].a ) ; if ( scafInfo[i].a == -1 ) continue ; int from, to ; genome.GetChrContigRange( genome.GetChrIdFromContigId( scafInfo[i].a ), from, to ) ; //printf( "%d: %d %d %d\n", i, scafInfo[i].b, from, to ) ; ForwardSearch( from, 0, i, visitTime, counter, visitDummy, contigGraph ) ; chosenCnt = 0 ; BackwardSearch( to, 1, i, visitTime, counter, contigGraph, chosen, chosenCnt ) ; /*printf( "%s %d (%d %d) %d\n", alignments.GetChromName( genome.GetChrIdFromContigId( scafInfo[i].a ) ), i, from, to, chosenCnt ) ; if ( chosenCnt > 1 ) { printf( "=== " ) ; for ( int j = 0 ; j < chosenCnt ; ++j ) printf( "%d ", chosen[j] ) ; printf( "\n" ) ; }*/ for ( int j = 0 ; j < chosenCnt ; ++j ) { ncnt = contigGraph.GetNeighbors( chosen[j], 0, neighbors, MAX_NEIGHBOR ) ; //printf( "%d %d %d: %d %d %d\n", j, chosen[j], ncnt, neighbors[0].a, visitTime[ neighbors[0].a ], // counter[neighbors[0].a ] ) ; for ( int k = 0 ; k < ncnt ; ++k ) { //if ( i == 639 ) // printf( "Neighbor from 0 %d: %d %d\n", k, neighbors[k].a, neighbors[k].b ) ; if ( visitTime[ neighbors[k].a ] == 2 * i + 1 && counter[neighbors[k].a ] == 2 ) { subgraph.AddEdge( chosen[j], 0, neighbors[k].a, neighbors[k].b, true ) ; //printf( "subgraph: (%d %d)=>(%d %d)\n", chosen[j], 0, neighbors[k].a, neighbors[k].b ) ; } } ncnt = contigGraph.GetNeighbors( chosen[j], 1, neighbors, MAX_NEIGHBOR ) ; for ( int k = 0 ; k < ncnt ; ++k ) { //if ( i == 639 ) // printf( "Neighbor from 1 %d: %d %d\n", k, neighbors[k].a, neighbors[k].b ) ; if ( visitTime[ neighbors[k].a ] == 2 * i + 1 && counter[neighbors[k].a ] == 2 ) { subgraph.AddEdge( chosen[j], 1, neighbors[k].a, neighbors[k].b, true ) ; //printf( "subgraph: (%d %d)=>(%d %d)\n", chosen[j], 1, neighbors[k].a, neighbors[k].b ) ; } } } // Initialize the degree counter for ( int j = 0 ; j < chosenCnt ; ++j ) { for ( int l = 0 ; l < 2 ; ++l ) { /*if ( i == 6145 ) { std::vector<struct _pair> neighbors ; ncnt = subgraph.GetNeighbors( chosen[j], l, neighbors ) ; printf( "%d ncnt=%d\n", l, ncnt ) ; }*/ ncnt = subgraph.GetNeighbors( chosen[j], l, neighbors, MAX_NEIGHBOR ) ; degree[ 2 * chosen[j] + l ] = ncnt ; } } // "topological" sort head = 0 ; isInQueue[from] = true ; queue[0].a = from ; queue[0].b = 0 ; tail = 1 ; int prevTag = -1 ; int *prevAdd = buffer ; // reuse counter to save some memory. int *nextAdd = buffer2 ; int firstAdd = -1 ; while ( head < tail ) { int tailTag = tail ; for ( int j = head ; j < tailTag ; ++j ) { nextAdd[j] = -1 ; if ( !used[ queue[j].a ] ) { used[ queue[j].a ] = true ; if ( prevTag != -1 ) { scaffold.AddEdge( queue[ prevTag].a, 1 - queue[prevTag].b, queue[j].a, queue[j].b ) ; nextAdd[ prevTag ] = j ; /*if ( i == 639 ) printf( "(%lld %lld)=>(%lld %lld)\n", queue[ prevTag].a, 1 - queue[prevTag].b, queue[j].a, queue[j].b ) ;*/ } else firstAdd = j ; prevTag = j ; } prevAdd[j] = prevTag ; // the most recent(<=) queue id when added to scaffold. ncnt = subgraph.GetNeighbors( queue[j].a, 1 - queue[j].b, neighbors, MAX_NEIGHBOR ) ; for ( int k = 0 ; k < ncnt ; ++k ) { --degree[ 2 * neighbors[k].a + neighbors[k].b ] ; if ( degree[ 2 * neighbors[k].a + neighbors[k].b ] == 0 && !isInQueue[neighbors[k].a] ) { isInQueue[ neighbors[k].a ] = true ; queue[ tail ] = neighbors[k] ; // Interesting assignment, I think. ++tail ; /*if ( i == 639 ) printf( "pushed in queue: %d\n", neighbors[k].a ) ;*/ // Put the consecutive contigs together. struct _pair testNeighbors[ MAX_NEIGHBOR ] ; struct _pair tag ; tag = neighbors[k] ; while ( 1 ) { if ( contigGraph.GetNeighbors( tag.a, 1 - tag.b, testNeighbors, MAX_NEIGHBOR ) != 1 ) break ; int n = subgraph.GetNeighbors( tag.a, 1 - tag.b, testNeighbors, MAX_NEIGHBOR ) ; if ( n != 1 ) break ; //printf( "%d %d\n", n, testNeighbors[0].a ) ; struct _pair backNeighbors[ MAX_NEIGHBOR ] ; if ( contigGraph.GetNeighbors( testNeighbors[0].a, testNeighbors[0].b, backNeighbors, MAX_NEIGHBOR ) != 1 ) break ; n = subgraph.GetNeighbors( testNeighbors[0].a, testNeighbors[0].b, backNeighbors, MAX_NEIGHBOR ) ; if ( n != 1 ) break ; isInQueue[ testNeighbors[0].a ] = true ; queue[tail] = testNeighbors[0] ; ++tail ; /*if ( i == 639 ) printf( "pushed in queue: %d\n", testNeighbors[0].a ) ;*/ tag = testNeighbors[0] ; } } } } head = tailTag ; } // Remove the effect on the subgraph. /*if ( tail != chosenCnt ) { printf( "WARNING: not matched\n" ) ; exit( 1 ) ; }*/ for ( int j = 0 ; j < tail ; ++j ) { visitDummy[ queue[j].a ] = -1 ; counter[ queue[j].a ] = -1 ; subgraph.RemoveAdjacentEdges( queue[j].a ) ; isInQueue[ queue[j].a ] = false ; } subgraph.ResetEdgeUsed() ; // no point is picked if ( prevTag == -1 ) { continue ; } // Update the gap size prevTag = -1 ; for ( int j = 0 ; j < tail - 1 ; ++j ) { if ( genome.GetChrIdFromContigId( queue[j].a ) == genome.GetChrIdFromContigId( from ) ) prevTag = queue[j].a ; else if ( prevTag != -1 ) { struct _contig c = genome.GetContigInfo( queue[j].a ) ; gapSize[prevTag] -= ( c.end - c.start + 1) ; } } // Add the dangling contigs. Use the fact that the queue holding the contigs in the same order as in the scaffold. // 5'->3' dangling int *chosenDummy = degree ; for ( int j = tail - 1 ; j >= 0 ; --j ) { //if ( j < tail - 1 ) // continue ; chosenCnt = 0 ; //if ( queue[j].a == 0 ) // printf( "Dummy: %d %d %d\n", j, queue[j].b, 1 - queue[j].b ) ; SearchDangling( queue[j].a, queue[j].b, used, danglingTime, danglingVisitTime, contigGraph, false, chosen, chosenDummy, chosenCnt, genome ) ; ++danglingTime ; int prevTag = prevAdd[j] ; /*if ( queue[j].a == 0 ) { struct _pair neighbors[5] ; int ncnt = contigGraph.GetNeighbors( queue[j].a, 1 - queue[j].b, neighbors, 5 ) ; printf( "%d %d %d %d: %d %d\n", queue[j].b, chosenCnt, prevTag, ncnt, neighbors[0].a, used[ neighbors[0].a ] ) ; }*/ if ( prevTag == -1 ) break ; // Trim the dangling list int k = chosenCnt - 1 ; if ( j > 0 && j < tail - 1 ) { for ( k = chosenCnt - 1 ; k >= 1 ; --k ) if ( genome.GetChrIdFromContigId( chosen[k] ) != genome.GetChrIdFromContigId( chosen[k - 1] ) ) break ; } // Test the gap size int len = 0 ; for ( int l = 0 ; l <= k ; ++l ) { struct _contig c = genome.GetContigInfo( chosen[k] ) ; len += c.end - c.start + 1 ; } if ( j < tail - 1 ) { int l ; for ( l = j ; l >= 0 ; --l ) if ( genome.GetChrIdFromContigId( queue[l].a ) == genome.GetChrIdFromContigId( from ) ) break ; if ( !ignoreGap && len >= gapSize[ queue[l].a ] + 100 ) continue ; else gapSize[ queue[l].a ] -= len ; } for ( ; k >= 0 ; --k ) { used[ chosen[k] ] = true ; //printf( "Dangling 1: %d=>%d\n", queue[prevTag].a, chosen[k] ) ; scaffold.InsertNode( queue[ prevTag ].a, 1 - queue[ prevTag ].b, chosen[k], chosenDummy[k] ) ; } } // 3'->5' dangling for ( int j = 0 ; j < tail ; ++j ) { //if ( j > 0 ) // continue ; chosenCnt = 0 ; SearchDangling( queue[j].a, 1 - queue[j].b, used, danglingTime, danglingVisitTime, contigGraph, false, chosen, chosenDummy, chosenCnt, genome ) ; ++danglingTime ; int prevTag = prevAdd[j] ; int nextTag ; if ( prevTag == -1 || j <= firstAdd ) nextTag = firstAdd ; else if ( j == prevTag ) nextTag = j ; else nextTag = nextAdd[ prevTag ] ; if ( nextTag == -1 ) break ; /*if ( queue[j].a == 37549 ) { struct _pair neighbors[5] ; int ncnt = contigGraph.GetNeighbors( queue[j].a, queue[j].b, neighbors, 5 ) ; fprintf( stderr, "%d %d %d: %d %d %d: %d %d %d\n", j, queue[j].a, queue[j].b, chosenCnt, nextTag, ncnt, chosen[0], chosenDummy[0], used[ chosen[0] ] ) ; }*/ // trim the danling list int k = chosenCnt - 1 ; if ( j < tail - 1 && j > 0 ) { for ( k = chosenCnt - 1 ; k >= 1 ; --k ) if ( genome.GetChrIdFromContigId( chosen[k] ) != genome.GetChrIdFromContigId( chosen[k - 1] ) ) break ; } // Test the gap size int len = 0 ; for ( int l = 0 ; l <= k ; ++l ) { struct _contig c = genome.GetContigInfo( chosen[k] ) ; len += c.end - c.start + 1 ; } if ( j > 0 ) { int l ; for ( l = j - 1 ; l >= 0 ; --l ) // Notice the j-1 here, because we want the gap strictly before current contig if ( genome.GetChrIdFromContigId( queue[l].a ) == genome.GetChrIdFromContigId( from ) ) break ; if ( !ignoreGap && len >= gapSize[ queue[l].a ] + 100 ) continue ; else gapSize[ queue[l].a ] -= len ; } for ( ; k >= 0 ; --k ) { used[ chosen[k] ] = true ; scaffold.InsertNode( queue[nextTag].a, queue[nextTag].b, chosen[k], chosenDummy[k] ) ; //printf( "Dangling 2: %d<=%d\n", queue[nextTag].a, chosen[k] ) ; //if ( chosen[k] == 10246 ) // printf( "hi %d %d %d %d\n", j, queue[j].a, k, chosen[k] ) ; } } } //return 0 ; // Output the scaffold int id = 0 ; char infoFileName[512] ; char outputFileName[512] ; sprintf( infoFileName, "%s.info", prefix ) ; sprintf( outputFileName, "%s.fa", prefix ) ; outputFile = fopen( outputFileName, "w" ) ; infoFile = fopen( infoFileName, "w") ; memset( used, false, sizeof( bool ) * contigCnt ) ; for ( i = 0 ; i < contigCnt ; ++i ) { //printf( "%d (%s)\n", i, alignments.GetChromName( genome.GetChrIdFromContigId( i ) ) ) ; fflush( stdout ) ; /*if ( i == 10246 ) { std::vector<struct _pair> neighbors ; scaffold.GetNeighbors( i, 0, neighbors ) ; printf( "%u\n", neighbors.size() ) ; }*/ if ( used[i] ) continue ; int ncnt1 = scaffold.GetNeighbors( i, 0, neighbors, MAX_NEIGHBOR ) ; int ncnt2 = scaffold.GetNeighbors( i, 1, neighbors, MAX_NEIGHBOR ) ; if ( ncnt1 == 0 || ncnt2 == 0 ) // The end of a scaffold { fprintf( outputFile, ">scaffold_%d\n", id) ; fprintf( infoFile, ">scaffold_%d", id ) ; ++id ; int p = i ; int dummyP = 1 ; if ( ncnt1 == 0 ) dummyP = 0 ; used[i] = true ; genome.PrintContig( outputFile, i, dummyP ) ; fprintf( infoFile, " (%s %d %c)", alignments.GetChromName( genome.GetChrIdFromContigId( p ) ), p, dummyP == 0 ? '+' : '-' ) ; while ( 1 ) { ncnt = scaffold.GetNeighbors( p, 1 - dummyP, neighbors, MAX_NEIGHBOR ) ; if ( ncnt == 0 ) break ; // ncnt must be 1 int insertN = 17 ; if ( genome.GetChrIdFromContigId( p ) == genome.GetChrIdFromContigId( neighbors[0].a ) ) { struct _contig cp, cna ; cp = genome.GetContigInfo( p ) ; cna = genome.GetContigInfo( neighbors[0].a ) ; if ( p < neighbors[0].a ) insertN = cna.start - cp.end - 1 ; else if ( p > neighbors[0].a ) insertN = cp.start - cna.end - 1 ; } p = neighbors[0].a ; dummyP = neighbors[0].b ; for ( int j = 0 ; j < insertN ; ++j ) fprintf( outputFile, "N" ) ; used[p] = true ; genome.PrintContig( outputFile, p, dummyP ) ; fprintf( infoFile, " (%s %d %c)", alignments.GetChromName( genome.GetChrIdFromContigId( p ) ), p, dummyP == 0 ? '+' : '-' ) ; } fprintf( outputFile, "\n" ) ; fprintf( infoFile, "\n" ) ; } } for ( i = 0 ; i < contigCnt ; ++i ) if ( !used[i] ) { fprintf( stderr, "Unreported contig %d.\n", i ) ; } fclose( outputFile ) ; fclose( infoFile ) ; delete[] buffer ; delete[] buffer2 ; delete[] chosen ; delete[] queue ; delete[] counter ; delete[] visitTime ; delete[] used ; delete[] scafInfo ; delete[] isInQueue ; delete[] gapSize ; //fclose( rascafFile ) ; return 0 ; }
int main( int argc, char *argv[] ) { int i ; int ret ; Alignments alignments ; Alignments clippedAlignments ; Blocks blocks ; Genome genome ; char *genomeFile = NULL ; if ( argc < 2 ) { printf( "%s", usage ) ; exit( 0 ) ; } minimumSupport = 2 ; minimumEffectiveLength = 200 ; kmerSize = 23 ; breakN = 1 ; minContigSize = 200 ; prefix = NULL ; VERBOSE = false ; outputConnectionSequence = false ; aggressiveMode = false ; for ( i = 1 ; i < argc ; ++i ) { if ( !strcmp( "-b", argv[i] ) ) { alignments.Open( argv[i + 1]) ; ++i ; } else if ( !strcmp( "-o", argv[i] ) ) { prefix = argv[i + 1] ; ++i ; } else if ( !strcmp( "-f", argv[i] ) ) { genomeFile = argv[i + 1] ; ++i ; } else if ( !strcmp( "-ms", argv[i] ) ) { minimumSupport = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-ml", argv[i] ) ) { minimumEffectiveLength = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-k", argv[i] ) ) { kmerSize = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-breakN", argv[i] ) ) { breakN = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-minContigSize", argv[i] ) ) { minContigSize = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-v", argv[i] ) ) { VERBOSE = true ; } else if ( !strcmp( "-cs", argv[i] ) ) { outputConnectionSequence = true ; } /*else if ( !strcmp( "-aggressive", argv[i] ) ) { aggressiveMode = true ; }*/ else if ( !strcmp( "-bc", argv[i] ) ) { // So far, assume the input is from BWA mem clippedAlignments.Open( argv[i + 1] ) ; clippedAlignments.SetAllowSupplementary( true ) ; ++i ; } else { fprintf( stderr, "Unknown parameter: %s\n", argv[i] ) ; exit( 1 ) ; } } if ( !alignments.IsOpened() ) { printf( "Must use -b to specify the bam file." ) ; return 0 ; } if ( prefix != NULL ) { char buffer[255] ; sprintf( buffer, "%s.out", prefix ) ; fpOut = fopen( buffer, "w" ) ; } else { char buffer[255] ; prefix = strdup( "rascaf" ) ; sprintf( buffer, "%s.out", prefix ) ; fpOut = fopen( buffer, "w" ) ; } if ( genomeFile != NULL ) { genome.Open( alignments, genomeFile ) ; alignments.Rewind() ; } if ( outputConnectionSequence == true && genomeFile == NULL ) { fprintf( stderr, "Must use -f to specify assembly file when using -cs\n" ) ; exit( EXIT_FAILURE ) ; } // 74619 //printf( "%c\n", genome.GetNucleotide( 74619, 4 ) ) ; //exit(0) ; // Build the graph ret = blocks.BuildExonBlocks( alignments, genome ) ; alignments.Rewind() ; fprintf( stderr, "Found %d exon blocks.\n", ret ) ; if ( clippedAlignments.IsOpened() ) { fprintf( stderr, "Extend exon blocks with clipped alignments.\n" ) ; Blocks extendBlocks ; extendBlocks.BuildExonBlocks( clippedAlignments, genome ) ; clippedAlignments.Rewind() ; ret = blocks.ExtendExonBlocks( extendBlocks ) ; fprintf( stderr, "Found %d exon blocks after extension.\n", ret ) ; } blocks.GetAlignmentsInfo( alignments ) ; alignments.Rewind() ; ret = blocks.BuildGeneBlocks( alignments, genome ) ; alignments.Rewind() ; fprintf( stderr, "Found %d gene blocks.\n", ret ) ; blocks.BuildGeneBlockGraph( alignments ) ; if ( clippedAlignments.IsOpened() ) { blocks.AddGeneBlockGraphByClippedAlignments( clippedAlignments ) ; } // Cleaning blocks.CleanGeneBlockGraph( alignments, genome ) ; // Scaffolding Scaffold scaffold( blocks, genome ) ; //scaffold.Init( blocks ) ; int componentCnt = scaffold.BuildComponent() ; fprintf( stderr, "Found %d non-trivial gene block components.\n", componentCnt ) ; // Possible for parallelization for ( i = 0 ; i < componentCnt ; ++i ) { scaffold.ScaffoldComponent( i ) ; } scaffold.ScaffoldGenome() ; // Output the command line fprintf( fpOut, "command line: " ) ; char *fullpath = (char *)malloc( sizeof( char ) * 4096 ) ; for ( i = 0 ; i < argc ; ++i ) { char c = ' ' ; if ( i == argc - 1 ) c = '\n' ; if ( i > 0 && !strcmp( argv[i - 1], "-b" ) ) { if ( realpath( argv[i], fullpath ) == NULL ) { fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ; exit( 1 ) ; } fprintf( fpOut, "%s%c", fullpath, c ) ; } else if ( i > 0 && !strcmp( argv[i - 1], "-f" ) ) { if ( realpath( argv[i], fullpath ) == NULL ) { fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ; exit( 1 ) ; } fprintf( fpOut, "%s%c", fullpath, c ) ; } else fprintf( fpOut, "%s%c", argv[i], c ) ; } free( fullpath ) ; scaffold.Output( fpOut, alignments ) ; return 0 ; }