int main( int argc, char *argv[] ) { int i ; int ret ; Alignments alignments ; Alignments clippedAlignments ; Blocks blocks ; Genome genome ; char *genomeFile = NULL ; if ( argc < 2 ) { printf( "%s", usage ) ; exit( 0 ) ; } minimumSupport = 2 ; minimumEffectiveLength = 200 ; kmerSize = 23 ; breakN = 1 ; minContigSize = 200 ; prefix = NULL ; VERBOSE = false ; outputConnectionSequence = false ; aggressiveMode = false ; for ( i = 1 ; i < argc ; ++i ) { if ( !strcmp( "-b", argv[i] ) ) { alignments.Open( argv[i + 1]) ; ++i ; } else if ( !strcmp( "-o", argv[i] ) ) { prefix = argv[i + 1] ; ++i ; } else if ( !strcmp( "-f", argv[i] ) ) { genomeFile = argv[i + 1] ; ++i ; } else if ( !strcmp( "-ms", argv[i] ) ) { minimumSupport = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-ml", argv[i] ) ) { minimumEffectiveLength = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-k", argv[i] ) ) { kmerSize = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-breakN", argv[i] ) ) { breakN = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-minContigSize", argv[i] ) ) { minContigSize = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-v", argv[i] ) ) { VERBOSE = true ; } else if ( !strcmp( "-cs", argv[i] ) ) { outputConnectionSequence = true ; } /*else if ( !strcmp( "-aggressive", argv[i] ) ) { aggressiveMode = true ; }*/ else if ( !strcmp( "-bc", argv[i] ) ) { // So far, assume the input is from BWA mem clippedAlignments.Open( argv[i + 1] ) ; clippedAlignments.SetAllowSupplementary( true ) ; ++i ; } else { fprintf( stderr, "Unknown parameter: %s\n", argv[i] ) ; exit( 1 ) ; } } if ( !alignments.IsOpened() ) { printf( "Must use -b to specify the bam file." ) ; return 0 ; } if ( prefix != NULL ) { char buffer[255] ; sprintf( buffer, "%s.out", prefix ) ; fpOut = fopen( buffer, "w" ) ; } else { char buffer[255] ; prefix = strdup( "rascaf" ) ; sprintf( buffer, "%s.out", prefix ) ; fpOut = fopen( buffer, "w" ) ; } if ( genomeFile != NULL ) { genome.Open( alignments, genomeFile ) ; alignments.Rewind() ; } if ( outputConnectionSequence == true && genomeFile == NULL ) { fprintf( stderr, "Must use -f to specify assembly file when using -cs\n" ) ; exit( EXIT_FAILURE ) ; } // 74619 //printf( "%c\n", genome.GetNucleotide( 74619, 4 ) ) ; //exit(0) ; // Build the graph ret = blocks.BuildExonBlocks( alignments, genome ) ; alignments.Rewind() ; fprintf( stderr, "Found %d exon blocks.\n", ret ) ; if ( clippedAlignments.IsOpened() ) { fprintf( stderr, "Extend exon blocks with clipped alignments.\n" ) ; Blocks extendBlocks ; extendBlocks.BuildExonBlocks( clippedAlignments, genome ) ; clippedAlignments.Rewind() ; ret = blocks.ExtendExonBlocks( extendBlocks ) ; fprintf( stderr, "Found %d exon blocks after extension.\n", ret ) ; } blocks.GetAlignmentsInfo( alignments ) ; alignments.Rewind() ; ret = blocks.BuildGeneBlocks( alignments, genome ) ; alignments.Rewind() ; fprintf( stderr, "Found %d gene blocks.\n", ret ) ; blocks.BuildGeneBlockGraph( alignments ) ; if ( clippedAlignments.IsOpened() ) { blocks.AddGeneBlockGraphByClippedAlignments( clippedAlignments ) ; } // Cleaning blocks.CleanGeneBlockGraph( alignments, genome ) ; // Scaffolding Scaffold scaffold( blocks, genome ) ; //scaffold.Init( blocks ) ; int componentCnt = scaffold.BuildComponent() ; fprintf( stderr, "Found %d non-trivial gene block components.\n", componentCnt ) ; // Possible for parallelization for ( i = 0 ; i < componentCnt ; ++i ) { scaffold.ScaffoldComponent( i ) ; } scaffold.ScaffoldGenome() ; // Output the command line fprintf( fpOut, "command line: " ) ; char *fullpath = (char *)malloc( sizeof( char ) * 4096 ) ; for ( i = 0 ; i < argc ; ++i ) { char c = ' ' ; if ( i == argc - 1 ) c = '\n' ; if ( i > 0 && !strcmp( argv[i - 1], "-b" ) ) { if ( realpath( argv[i], fullpath ) == NULL ) { fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ; exit( 1 ) ; } fprintf( fpOut, "%s%c", fullpath, c ) ; } else if ( i > 0 && !strcmp( argv[i - 1], "-f" ) ) { if ( realpath( argv[i], fullpath ) == NULL ) { fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ; exit( 1 ) ; } fprintf( fpOut, "%s%c", fullpath, c ) ; } else fprintf( fpOut, "%s%c", argv[i], c ) ; } free( fullpath ) ; scaffold.Output( fpOut, alignments ) ; return 0 ; }
int main( int argc, char *argv[] ) { Alignments alignments ; Genome genome ; std::vector<int> rascafFileId ; char line[2048] ; char prefix[512] = "rascaf_scaffold" ; int rawAssemblyInd = 1 ; FILE *rascafFile ; bool contigLevel = false ; int i ; FILE *outputFile ; FILE *infoFile ; breakN = 1 ; if ( argc < 2 ) { fprintf( stderr, "%s", usage ) ; exit( 1 ) ; } for ( i = 1 ; i < argc ; ++i ) { if ( !strcmp( "-o", argv[i] ) ) { strcpy( prefix, argv[i + 1 ] ) ; ++i ; } else if ( !strcmp( "-ms", argv[i] ) ) { minSupport = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-ignoreGap", argv[i] ) ) { ignoreGap = true ; } else if ( !strcmp( "-r", argv[i] ) ) { rascafFileId.push_back( i + 1 ) ; ++i ; } else { fprintf( stderr, "Unknown option: %s\n", argv[i] ) ; exit( EXIT_FAILURE ) ; } } if ( rascafFileId.size() == 0 ) { fprintf( stderr, "Must use -r to specify rascaf output file.\n" ) ; exit( EXIT_FAILURE ) ; } MAX_NEIGHBOR = 1 + rascafFileId.size() ; // Get the bam file. rascafFile = fopen( argv[ rascafFileId[0] ], "r" ) ; while ( fgets( line, sizeof( line ), rascafFile ) != NULL ) { if ( strstr( line, "command line:" ) ) { char *p ; char buffer[512] ; p = strstr( line, "-breakN" ) ; if ( p != NULL ) { p += 7 ; while ( *p == ' ' ) ++p ; for ( i = 0 ; *p && *p != ' ' ; ++p, ++i ) buffer[i] = *p ; buffer[i] = '\0' ; breakN = atoi( buffer ) ; } p = strstr( line, "-b" ) ; if ( p == NULL ) { fprintf( stderr, "Could not find the bam file specified by -b in Rascaf.\n" ) ; exit( 1 ) ; } p += 2 ; while ( *p == ' ' ) ++p ; for ( i = 0 ; *p && *p != ' ' ; ++p, ++i ) buffer[i] = *p ; buffer[i] = '\0' ; alignments.Open( buffer ) ; p = strstr( line, "-f") ; if ( p == NULL ) { fprintf( stderr, "Could not find the raw assembly file specified by -f in Rascaf.\n" ) ; exit( 1 ) ; } p += 2 ; while ( *p == ' ' ) ++p ; for ( i = 0 ; *p && *p != ' ' && *p != '\n' ; ++p, ++i ) buffer[i] = *p ; buffer[i] = '\0' ; fprintf( stderr, "Found raw assembly file: %s\n", buffer ) ; genome.Open( alignments, buffer ) ; break ; } } fclose( rascafFile ) ; // Parse the input. for ( unsigned int fid = 0 ; fid < rascafFileId.size() ; ++fid ) { rascafFile = fopen( argv[ rascafFileId[fid] ], "r" ) ; bool start = false ; int tag ; while ( fgets( line, sizeof( line ), rascafFile ) != NULL ) { if ( strstr( line, "command line:" ) ) { start = true ; if ( strstr( line, "-f" ) ) { contigLevel = true ; } continue ; } if ( !start ) continue ; if ( !strcmp( line, "WARNINGS:\n" ) ) break ; std::vector<struct _part> nparts ; if ( line[0] >= '0' && line[0] <= '9' ) { AddConnection( line, alignments, nparts ) ; connects.push_back( nparts ) ; tag = 0 ; } else if ( line[0] == '\t' || line[0] == ' ' ) { // Break the nparts if the support is too low. int num = 0 ; for ( i = 0 ; line[i] < '0' || line[i] > '9' ; ++i ) ; for ( ; line[i] >= '0' && line[i] <= '9' ; ++i ) num = num * 10 + line[i] - '0' ; ++tag ; if ( num < minSupport ) { nparts = connects.back() ; connects.pop_back() ; int size = nparts.size() ; std::vector<struct _part> newNParts ; for ( i = 0 ; i < tag ; ++i ) newNParts.push_back( nparts[i] ) ; if ( newNParts.size() > 1 ) connects.push_back( newNParts ) ; newNParts.clear() ; for ( ; i < size ; ++i ) newNParts.push_back( nparts[i] ) ; if ( newNParts.size() > 1 ) connects.push_back( newNParts ) ; tag = 0 ; } } } fclose( rascafFile ) ; } if ( contigLevel == false ) { genome.SetIsOpen( contigLevel ) ; } // Build the graph int contigCnt = genome.GetContigCount() ; int edgeCnt = 0 ; int csize = connects.size() ; for ( i = 0 ; i < csize ; ++i ) edgeCnt += connects[i].size() ; ContigGraph contigGraph( contigCnt, contigCnt + edgeCnt ) ; for ( i = 0 ; i < contigCnt - 1 ; ++i ) { if ( genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( i + 1 ) ) { contigGraph.AddEdge( i, 1, i + 1, 0 ) ; } } for ( i = 0 ; i < csize ; ++i ) { std::vector<struct _part> &parts = connects[i] ; int size = parts.size() ; for ( int j = 0 ; j < size - 1 ; ++j ) { struct _part &a = parts[j] ; struct _part &b = parts[j + 1] ; // Two dummy nodes for each contig. Left is 0, right is 1 int dummyU = 0 ; int dummyV = 0 ; if ( a.strand == '+' ) dummyU = 1 ; if ( b.strand == '-' ) dummyV = 1 ; contigGraph.AddEdge( a.contigId, dummyU, b.contigId, dummyV, true ) ; } } // Check the cycles in the contig graph. This may introduces when combining different rascaf outputs. int *visitTime = new int[contigCnt] ; struct _pair *neighbors = new struct _pair[ MAX_NEIGHBOR ] ; bool *isInCycle = new bool[contigCnt] ; std::vector<int> cycleNodes ; memset( visitTime, -1, sizeof( int ) * contigCnt ) ; memset( isInCycle, false, sizeof( bool ) * contigCnt ) ; for ( i = 0 ; i < contigCnt ; ++i ) { if ( isInCycle[i] ) continue ; if ( contigGraph.IsInCycle( i, cycleNodes, visitTime ) ) { int cnt = cycleNodes.size() ; //printf( "===\n") ; for ( int j = 0 ; j < cnt ; ++j ) { //printf( "In cycle %d\n", cycleNodes[j] ) ; isInCycle[ cycleNodes[j] ] = true ; } } } //exit( 1 ) ; // Remove the connected edges involving the nodes in the cycle for ( i = 0 ; i < contigCnt ; ++i ) { if ( isInCycle[i] ) { for ( int dummy = 0 ; dummy <= 1 ; ++dummy ) { int ncnt = contigGraph.GetNeighbors( i, dummy, neighbors, MAX_NEIGHBOR ) ; for ( int j = 0 ; j < ncnt ; ++j ) { if ( neighbors[j].a == i + 2 * dummy - 1 && neighbors[j].b != dummy && genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( neighbors[j].a ) ) continue ; // the connection created by the raw assembly else contigGraph.RemoveEdge( i, dummy, neighbors[j].a, neighbors[j].b ) ; } } } } //delete[] isInCycle ; //printf( "hi: %d %d\n", __LINE__, contigCnt ) ; //printf( "%d %d\n", contigGraph.GetNeighbors( 0, 0, neighbors, MAX_NEIGHBOR ), contigGraph.GetNeighbors( 0, 1, neighbors, MAX_NEIGHBOR ) ) ; // Sort the scaffolds from fasta file, so that longer scaffold come first int scafCnt = genome.GetChrCount() ; struct _pair *scafInfo = new struct _pair[scafCnt] ; memset( scafInfo, -1, sizeof( struct _pair) * scafCnt ) ; for ( i = 0 ; i < contigCnt ; ++i ) { int chrId = genome.GetChrIdFromContigId( i ) ; if ( scafInfo[chrId].a == -1 ) { scafInfo[ chrId ].a = i ; scafInfo[ chrId ].b = genome.GetChrLength( chrId ) ; } } qsort( scafInfo, scafCnt, sizeof( struct _pair ), CompScaffold ) ; // Merge the branches and build the scaffold ContigGraph scaffold( contigCnt, 2 * contigCnt ) ; // Use a method similar to topological sort bool *used = new bool[contigCnt] ; int *degree = new int[2 *contigCnt] ; int *danglingVisitTime = new int[contigCnt] ; int *counter = new int[contigCnt] ; int *visitDummy = new int[ contigCnt ] ; int *buffer = new int[contigCnt] ; int *buffer2 = new int[contigCnt] ; bool *isInQueue = new bool[ contigCnt ] ; int *chosen = new int[contigCnt] ; int chosenCnt ; memset( isInCycle, false, sizeof( bool ) * contigCnt ) ; memset( visitTime, -1, sizeof( int ) * contigCnt ) ; memset( visitDummy, -1, sizeof( int ) * contigCnt ) ; memset( counter, -1, sizeof( int ) * contigCnt ) ; // Use those memory to remove triangular cycles for ( i = 0 ; i < scafCnt ; ++i ) { int from, to ; if ( scafInfo[i].a == -1 ) continue ; genome.GetChrContigRange( genome.GetChrIdFromContigId( scafInfo[i].a ), from, to ) ; ForwardSearch( from, 0, i, visitTime, counter, visitDummy, contigGraph ) ; chosenCnt = 0 ; BackwardSearchForTriangularCycle( to, 1, i, visitTime, counter, visitDummy, contigGraph, chosen, chosenCnt ) ; for ( int j = 0 ; j < chosenCnt ; ++j ) { //printf( "%d\n", chosen[j] ) ; isInCycle[ chosen[j] ] = true ; } } for ( i = 0 ; i < contigCnt ; ++i ) { if ( isInCycle[i] ) { for ( int dummy = 0 ; dummy <= 1 ; ++dummy ) { int ncnt = contigGraph.GetNeighbors( i, dummy, neighbors, MAX_NEIGHBOR ) ; for ( int j = 0 ; j < ncnt ; ++j ) { if ( neighbors[j].a == i + 2 * dummy - 1 && neighbors[j].b != dummy && genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( neighbors[j].a ) ) continue ; // the connection created by the raw assembly else contigGraph.RemoveEdge( i, dummy, neighbors[j].a, neighbors[j].b ) ; } } } } memset( used, false, sizeof( bool ) * contigCnt ) ; memset( visitTime, -1, sizeof( int ) * contigCnt ) ; memset( visitDummy, -1, sizeof( int ) * contigCnt ) ; memset( danglingVisitTime, -1, sizeof( int ) * contigCnt ) ; memset( counter, -1, sizeof( int ) * contigCnt ) ; memset( isInQueue, false, sizeof( bool ) * contigCnt ) ; ContigGraph newGraph( contigCnt, edgeCnt ) ; // Compute the gap size int *gapSize = new int[contigCnt] ; for ( i = 0 ; i < contigCnt - 1 ; ++i ) { if ( genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( i + 1 ) ) { struct _contig c1 = genome.GetContigInfo( i ) ; struct _contig c2 = genome.GetContigInfo( i + 1 ) ; gapSize[i] = c2.start - c1.end - 1 ; } else gapSize[i] = -1 ; } // Start search int ncnt ; struct _pair *queue = new struct _pair[ contigCnt ] ; int head = 0, tail ; int danglingTime = 0 ; // Pre-allocate the subgraph. ContigGraph subgraph( contigCnt, 3 * contigCnt ) ; for ( i = 0 ; i < scafCnt ; ++i ) { //if ( used[144281] == true ) // printf( "changed %d %d\n", i, scafInfo[i - 1].a ) ; if ( scafInfo[i].a == -1 ) continue ; int from, to ; genome.GetChrContigRange( genome.GetChrIdFromContigId( scafInfo[i].a ), from, to ) ; //printf( "%d: %d %d %d\n", i, scafInfo[i].b, from, to ) ; ForwardSearch( from, 0, i, visitTime, counter, visitDummy, contigGraph ) ; chosenCnt = 0 ; BackwardSearch( to, 1, i, visitTime, counter, contigGraph, chosen, chosenCnt ) ; /*printf( "%s %d (%d %d) %d\n", alignments.GetChromName( genome.GetChrIdFromContigId( scafInfo[i].a ) ), i, from, to, chosenCnt ) ; if ( chosenCnt > 1 ) { printf( "=== " ) ; for ( int j = 0 ; j < chosenCnt ; ++j ) printf( "%d ", chosen[j] ) ; printf( "\n" ) ; }*/ for ( int j = 0 ; j < chosenCnt ; ++j ) { ncnt = contigGraph.GetNeighbors( chosen[j], 0, neighbors, MAX_NEIGHBOR ) ; //printf( "%d %d %d: %d %d %d\n", j, chosen[j], ncnt, neighbors[0].a, visitTime[ neighbors[0].a ], // counter[neighbors[0].a ] ) ; for ( int k = 0 ; k < ncnt ; ++k ) { //if ( i == 639 ) // printf( "Neighbor from 0 %d: %d %d\n", k, neighbors[k].a, neighbors[k].b ) ; if ( visitTime[ neighbors[k].a ] == 2 * i + 1 && counter[neighbors[k].a ] == 2 ) { subgraph.AddEdge( chosen[j], 0, neighbors[k].a, neighbors[k].b, true ) ; //printf( "subgraph: (%d %d)=>(%d %d)\n", chosen[j], 0, neighbors[k].a, neighbors[k].b ) ; } } ncnt = contigGraph.GetNeighbors( chosen[j], 1, neighbors, MAX_NEIGHBOR ) ; for ( int k = 0 ; k < ncnt ; ++k ) { //if ( i == 639 ) // printf( "Neighbor from 1 %d: %d %d\n", k, neighbors[k].a, neighbors[k].b ) ; if ( visitTime[ neighbors[k].a ] == 2 * i + 1 && counter[neighbors[k].a ] == 2 ) { subgraph.AddEdge( chosen[j], 1, neighbors[k].a, neighbors[k].b, true ) ; //printf( "subgraph: (%d %d)=>(%d %d)\n", chosen[j], 1, neighbors[k].a, neighbors[k].b ) ; } } } // Initialize the degree counter for ( int j = 0 ; j < chosenCnt ; ++j ) { for ( int l = 0 ; l < 2 ; ++l ) { /*if ( i == 6145 ) { std::vector<struct _pair> neighbors ; ncnt = subgraph.GetNeighbors( chosen[j], l, neighbors ) ; printf( "%d ncnt=%d\n", l, ncnt ) ; }*/ ncnt = subgraph.GetNeighbors( chosen[j], l, neighbors, MAX_NEIGHBOR ) ; degree[ 2 * chosen[j] + l ] = ncnt ; } } // "topological" sort head = 0 ; isInQueue[from] = true ; queue[0].a = from ; queue[0].b = 0 ; tail = 1 ; int prevTag = -1 ; int *prevAdd = buffer ; // reuse counter to save some memory. int *nextAdd = buffer2 ; int firstAdd = -1 ; while ( head < tail ) { int tailTag = tail ; for ( int j = head ; j < tailTag ; ++j ) { nextAdd[j] = -1 ; if ( !used[ queue[j].a ] ) { used[ queue[j].a ] = true ; if ( prevTag != -1 ) { scaffold.AddEdge( queue[ prevTag].a, 1 - queue[prevTag].b, queue[j].a, queue[j].b ) ; nextAdd[ prevTag ] = j ; /*if ( i == 639 ) printf( "(%lld %lld)=>(%lld %lld)\n", queue[ prevTag].a, 1 - queue[prevTag].b, queue[j].a, queue[j].b ) ;*/ } else firstAdd = j ; prevTag = j ; } prevAdd[j] = prevTag ; // the most recent(<=) queue id when added to scaffold. ncnt = subgraph.GetNeighbors( queue[j].a, 1 - queue[j].b, neighbors, MAX_NEIGHBOR ) ; for ( int k = 0 ; k < ncnt ; ++k ) { --degree[ 2 * neighbors[k].a + neighbors[k].b ] ; if ( degree[ 2 * neighbors[k].a + neighbors[k].b ] == 0 && !isInQueue[neighbors[k].a] ) { isInQueue[ neighbors[k].a ] = true ; queue[ tail ] = neighbors[k] ; // Interesting assignment, I think. ++tail ; /*if ( i == 639 ) printf( "pushed in queue: %d\n", neighbors[k].a ) ;*/ // Put the consecutive contigs together. struct _pair testNeighbors[ MAX_NEIGHBOR ] ; struct _pair tag ; tag = neighbors[k] ; while ( 1 ) { if ( contigGraph.GetNeighbors( tag.a, 1 - tag.b, testNeighbors, MAX_NEIGHBOR ) != 1 ) break ; int n = subgraph.GetNeighbors( tag.a, 1 - tag.b, testNeighbors, MAX_NEIGHBOR ) ; if ( n != 1 ) break ; //printf( "%d %d\n", n, testNeighbors[0].a ) ; struct _pair backNeighbors[ MAX_NEIGHBOR ] ; if ( contigGraph.GetNeighbors( testNeighbors[0].a, testNeighbors[0].b, backNeighbors, MAX_NEIGHBOR ) != 1 ) break ; n = subgraph.GetNeighbors( testNeighbors[0].a, testNeighbors[0].b, backNeighbors, MAX_NEIGHBOR ) ; if ( n != 1 ) break ; isInQueue[ testNeighbors[0].a ] = true ; queue[tail] = testNeighbors[0] ; ++tail ; /*if ( i == 639 ) printf( "pushed in queue: %d\n", testNeighbors[0].a ) ;*/ tag = testNeighbors[0] ; } } } } head = tailTag ; } // Remove the effect on the subgraph. /*if ( tail != chosenCnt ) { printf( "WARNING: not matched\n" ) ; exit( 1 ) ; }*/ for ( int j = 0 ; j < tail ; ++j ) { visitDummy[ queue[j].a ] = -1 ; counter[ queue[j].a ] = -1 ; subgraph.RemoveAdjacentEdges( queue[j].a ) ; isInQueue[ queue[j].a ] = false ; } subgraph.ResetEdgeUsed() ; // no point is picked if ( prevTag == -1 ) { continue ; } // Update the gap size prevTag = -1 ; for ( int j = 0 ; j < tail - 1 ; ++j ) { if ( genome.GetChrIdFromContigId( queue[j].a ) == genome.GetChrIdFromContigId( from ) ) prevTag = queue[j].a ; else if ( prevTag != -1 ) { struct _contig c = genome.GetContigInfo( queue[j].a ) ; gapSize[prevTag] -= ( c.end - c.start + 1) ; } } // Add the dangling contigs. Use the fact that the queue holding the contigs in the same order as in the scaffold. // 5'->3' dangling int *chosenDummy = degree ; for ( int j = tail - 1 ; j >= 0 ; --j ) { //if ( j < tail - 1 ) // continue ; chosenCnt = 0 ; //if ( queue[j].a == 0 ) // printf( "Dummy: %d %d %d\n", j, queue[j].b, 1 - queue[j].b ) ; SearchDangling( queue[j].a, queue[j].b, used, danglingTime, danglingVisitTime, contigGraph, false, chosen, chosenDummy, chosenCnt, genome ) ; ++danglingTime ; int prevTag = prevAdd[j] ; /*if ( queue[j].a == 0 ) { struct _pair neighbors[5] ; int ncnt = contigGraph.GetNeighbors( queue[j].a, 1 - queue[j].b, neighbors, 5 ) ; printf( "%d %d %d %d: %d %d\n", queue[j].b, chosenCnt, prevTag, ncnt, neighbors[0].a, used[ neighbors[0].a ] ) ; }*/ if ( prevTag == -1 ) break ; // Trim the dangling list int k = chosenCnt - 1 ; if ( j > 0 && j < tail - 1 ) { for ( k = chosenCnt - 1 ; k >= 1 ; --k ) if ( genome.GetChrIdFromContigId( chosen[k] ) != genome.GetChrIdFromContigId( chosen[k - 1] ) ) break ; } // Test the gap size int len = 0 ; for ( int l = 0 ; l <= k ; ++l ) { struct _contig c = genome.GetContigInfo( chosen[k] ) ; len += c.end - c.start + 1 ; } if ( j < tail - 1 ) { int l ; for ( l = j ; l >= 0 ; --l ) if ( genome.GetChrIdFromContigId( queue[l].a ) == genome.GetChrIdFromContigId( from ) ) break ; if ( !ignoreGap && len >= gapSize[ queue[l].a ] + 100 ) continue ; else gapSize[ queue[l].a ] -= len ; } for ( ; k >= 0 ; --k ) { used[ chosen[k] ] = true ; //printf( "Dangling 1: %d=>%d\n", queue[prevTag].a, chosen[k] ) ; scaffold.InsertNode( queue[ prevTag ].a, 1 - queue[ prevTag ].b, chosen[k], chosenDummy[k] ) ; } } // 3'->5' dangling for ( int j = 0 ; j < tail ; ++j ) { //if ( j > 0 ) // continue ; chosenCnt = 0 ; SearchDangling( queue[j].a, 1 - queue[j].b, used, danglingTime, danglingVisitTime, contigGraph, false, chosen, chosenDummy, chosenCnt, genome ) ; ++danglingTime ; int prevTag = prevAdd[j] ; int nextTag ; if ( prevTag == -1 || j <= firstAdd ) nextTag = firstAdd ; else if ( j == prevTag ) nextTag = j ; else nextTag = nextAdd[ prevTag ] ; if ( nextTag == -1 ) break ; /*if ( queue[j].a == 37549 ) { struct _pair neighbors[5] ; int ncnt = contigGraph.GetNeighbors( queue[j].a, queue[j].b, neighbors, 5 ) ; fprintf( stderr, "%d %d %d: %d %d %d: %d %d %d\n", j, queue[j].a, queue[j].b, chosenCnt, nextTag, ncnt, chosen[0], chosenDummy[0], used[ chosen[0] ] ) ; }*/ // trim the danling list int k = chosenCnt - 1 ; if ( j < tail - 1 && j > 0 ) { for ( k = chosenCnt - 1 ; k >= 1 ; --k ) if ( genome.GetChrIdFromContigId( chosen[k] ) != genome.GetChrIdFromContigId( chosen[k - 1] ) ) break ; } // Test the gap size int len = 0 ; for ( int l = 0 ; l <= k ; ++l ) { struct _contig c = genome.GetContigInfo( chosen[k] ) ; len += c.end - c.start + 1 ; } if ( j > 0 ) { int l ; for ( l = j - 1 ; l >= 0 ; --l ) // Notice the j-1 here, because we want the gap strictly before current contig if ( genome.GetChrIdFromContigId( queue[l].a ) == genome.GetChrIdFromContigId( from ) ) break ; if ( !ignoreGap && len >= gapSize[ queue[l].a ] + 100 ) continue ; else gapSize[ queue[l].a ] -= len ; } for ( ; k >= 0 ; --k ) { used[ chosen[k] ] = true ; scaffold.InsertNode( queue[nextTag].a, queue[nextTag].b, chosen[k], chosenDummy[k] ) ; //printf( "Dangling 2: %d<=%d\n", queue[nextTag].a, chosen[k] ) ; //if ( chosen[k] == 10246 ) // printf( "hi %d %d %d %d\n", j, queue[j].a, k, chosen[k] ) ; } } } //return 0 ; // Output the scaffold int id = 0 ; char infoFileName[512] ; char outputFileName[512] ; sprintf( infoFileName, "%s.info", prefix ) ; sprintf( outputFileName, "%s.fa", prefix ) ; outputFile = fopen( outputFileName, "w" ) ; infoFile = fopen( infoFileName, "w") ; memset( used, false, sizeof( bool ) * contigCnt ) ; for ( i = 0 ; i < contigCnt ; ++i ) { //printf( "%d (%s)\n", i, alignments.GetChromName( genome.GetChrIdFromContigId( i ) ) ) ; fflush( stdout ) ; /*if ( i == 10246 ) { std::vector<struct _pair> neighbors ; scaffold.GetNeighbors( i, 0, neighbors ) ; printf( "%u\n", neighbors.size() ) ; }*/ if ( used[i] ) continue ; int ncnt1 = scaffold.GetNeighbors( i, 0, neighbors, MAX_NEIGHBOR ) ; int ncnt2 = scaffold.GetNeighbors( i, 1, neighbors, MAX_NEIGHBOR ) ; if ( ncnt1 == 0 || ncnt2 == 0 ) // The end of a scaffold { fprintf( outputFile, ">scaffold_%d\n", id) ; fprintf( infoFile, ">scaffold_%d", id ) ; ++id ; int p = i ; int dummyP = 1 ; if ( ncnt1 == 0 ) dummyP = 0 ; used[i] = true ; genome.PrintContig( outputFile, i, dummyP ) ; fprintf( infoFile, " (%s %d %c)", alignments.GetChromName( genome.GetChrIdFromContigId( p ) ), p, dummyP == 0 ? '+' : '-' ) ; while ( 1 ) { ncnt = scaffold.GetNeighbors( p, 1 - dummyP, neighbors, MAX_NEIGHBOR ) ; if ( ncnt == 0 ) break ; // ncnt must be 1 int insertN = 17 ; if ( genome.GetChrIdFromContigId( p ) == genome.GetChrIdFromContigId( neighbors[0].a ) ) { struct _contig cp, cna ; cp = genome.GetContigInfo( p ) ; cna = genome.GetContigInfo( neighbors[0].a ) ; if ( p < neighbors[0].a ) insertN = cna.start - cp.end - 1 ; else if ( p > neighbors[0].a ) insertN = cp.start - cna.end - 1 ; } p = neighbors[0].a ; dummyP = neighbors[0].b ; for ( int j = 0 ; j < insertN ; ++j ) fprintf( outputFile, "N" ) ; used[p] = true ; genome.PrintContig( outputFile, p, dummyP ) ; fprintf( infoFile, " (%s %d %c)", alignments.GetChromName( genome.GetChrIdFromContigId( p ) ), p, dummyP == 0 ? '+' : '-' ) ; } fprintf( outputFile, "\n" ) ; fprintf( infoFile, "\n" ) ; } } for ( i = 0 ; i < contigCnt ; ++i ) if ( !used[i] ) { fprintf( stderr, "Unreported contig %d.\n", i ) ; } fclose( outputFile ) ; fclose( infoFile ) ; delete[] buffer ; delete[] buffer2 ; delete[] chosen ; delete[] queue ; delete[] counter ; delete[] visitTime ; delete[] used ; delete[] scafInfo ; delete[] isInQueue ; delete[] gapSize ; //fclose( rascafFile ) ; return 0 ; }
int main( int argc, char *argv[] ) { int i, j, k ; FILE *fp ; char buffer[10100] ; std::vector<std::string> fields ; int binSize = 50 ; const char *backboneName = NULL ; // the compatilibity between the alignment and allele // The likelihood of this read from this allele struct _compatible **compatibility ; int **snpAllele ; // whether a snp showed up in the allele. Alignments alignments ; alignments.Open( argv[2] ) ; for ( i = 3 ; i < argc ; ++i ) { if ( !strcmp( argv[i], "-b" ) ) { backboneName = argv[i + 1] ; alignments.OnlyChrom( backboneName ) ; ++i ; } else { fprintf( stderr, "Unknown argument %s.\n", argv[i] ) ; exit( 1 ) ; } } // Parse the files associate with the snps // Firstly, read in the snp list have the information sprintf( buffer, "%s.snp", argv[1] ) ; fp = fopen( buffer, "r" ) ; k = 0 ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { Split( buffer, '\t', fields ) ; if ( backboneName && strcmp( fields[2].c_str(), backboneName ) ) continue ; snpNameToId[ fields[0] ] = k ; struct _snpInfo info ; info.type = fields[1][0] ; if ( info.type == 'd' ) { info.position = atoi( fields[3].c_str() ) ; info.length = atoi( fields[4].c_str() ) ; } else if ( info.type == 'i' ) { info.position = atoi( fields[3].c_str() ) ; info.length = strlen( fields[4].c_str() ) ; } else { info.position = atoi( fields[3].c_str() ) ; // notice that the snp file is 0-based index. info.length = 1 ; info.nucleotide = fields[4][0] ; } snpInfo.push_back( info ) ; std::vector< int > tmpList ; snpLink.push_back( tmpList ) ; for ( int p = 0 ; p < info.length ; ++p ) { if ( info.type != 'i' || p == 0 ) { if ( positionToSnp.find( info.position + p ) == positionToSnp.end() ) { positionToSnp[ info.position + p] = tmpList ; } positionToSnp[ info.position + p].push_back( k ) ; } } ++k ; } fclose( fp ) ; // Read in the link file. Determine the id of alleles and the association // of alleles and snps. // TODO: obtain the length of each allele and take the length into account in the statistical model // Add the id for the backbound int backboneLength = 0 ; sprintf( buffer, "%s_backbone.fa", argv[1] ) ; fp = fopen( buffer, "r" ) ; /*for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i ) ; buffer[i] = '\0' ; std::string backboneName( buffer + 1 ) ; alleleNameToId[ backboneName ] = 0 ; alleleIdToName.push_back( backboneName ) ;*/ bool start = false ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { if ( buffer[0] == '>' ) { for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i ) ; buffer[i] = '\0' ; if ( !strcmp( backboneName, buffer + 1 ) ) { start = true ; } else if ( start ) break ; } if ( start && buffer[0] != '>' ) { int len = strlen( buffer ) ; if ( buffer[len - 1 ] == '\n' ) backboneLength += len - 1 ; else backboneLength += len ; } } fclose( fp ) ; /*k = 0 ; if ( k == 0 ) { std::vector<int> tmpList ; alleleSnpList.push_back( tmpList ) ; alleleLength.push_back( backboneLength ) ; }*/ // scanning the link file sprintf( buffer, "%s.link", argv[1] ) ; fp = fopen( buffer, "r" ) ; k = 0 ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { std::vector<std::string> tmpFields ; Split( buffer, '\t', tmpFields ) ; // skip the snps from other backbones if ( snpNameToId.find( tmpFields[0] ) == snpNameToId.end() ) continue ; int snpId = snpNameToId[ tmpFields[0] ] ; Split( tmpFields[1].c_str(), ' ', fields ) ; int size = fields.size() ; for ( i = 0 ; i < size ; ++i ) { if ( alleleNameToId.find( fields[i] ) == alleleNameToId.end() ) { //printf( "%s %d\n", fields[i].c_str(), k ) ; alleleNameToId[ fields[i] ] = k ; alleleIdToName.push_back( fields[i] ) ; std::vector<int> tmpList ; alleleSnpList.push_back( tmpList ) ; alleleLength.push_back( backboneLength ) ; ++k ; } int alleleId = alleleNameToId[ fields[i] ] ; //if ( snpId == 118 ) // printf( "%s: %s %d\n", tmpFields[0].c_str(), fields[i].c_str(), alleleId ) ; snpLink[ snpId ].push_back( alleleId ) ; alleleSnpList[ alleleId ].push_back( snpId ) ; if ( snpInfo[ snpId ].type == 'd' ) { alleleLength[ alleleId ] -= snpInfo[ snpId ].length ; } else if ( snpInfo[ snpId ].type == 'i' ) { alleleLength[ alleleId ] += snpInfo[ snpId ].length ; } } } fclose( fp ) ; int numOfAllele = alleleIdToName.size() ; int numOfSnps = snpLink.size() ; snpAllele = new int* [numOfSnps] ; for ( i = 0 ; i < numOfSnps ; ++i ) { snpAllele[i] = new int[numOfAllele] ; memset( snpAllele[i], 0, sizeof( int ) * numOfAllele ) ; } for ( i = 0 ; i < numOfSnps ; ++i ) { int size = snpLink[i].size() ; for ( j = 0 ; j < size ; ++j ) { snpAllele[i][ snpLink[i][j] ] = 1 ; } } // Compute the compatbility score for each alignment and the allele // Get the number of alignment int numOfAlignments = 0 ; while ( alignments.Next() ) ++numOfAlignments ; alignments.Rewind() ; compatibility = new struct _compatible*[numOfAlignments] ; for ( i = 0 ; i < numOfAlignments ; ++i ) { compatibility[i] = new struct _compatible[ numOfAllele ] ; for ( j = 0 ; j < numOfAllele ; ++j ) { compatibility[i][j].value = 0 ;//-log( (double)alleleLength[j] ) / log( 10.0 ); } } i = 0 ; bool *snpHit = new bool[ numOfSnps ] ; while ( alignments.Next() ) { struct _pair coord = alignments.segments[0] ; alignmentCoords.push_back( coord ) ; memset( snpHit, 0, sizeof( bool ) * numOfSnps ) ; Split( alignments.GetFieldZ( "Zs" ), ',', fields ) ; int size = fields.size() ; for ( k = 0 ; k < size ; ++k ) { std::vector<std::string> subfields ; Split( fields[k].c_str(), '|', subfields ) ; int snpId = snpNameToId[ subfields[2] ] ; snpHit[ snpId ] = true ; } for ( k = coord.a ; k <= coord.b ; ++k ) { int size = positionToSnp[k].size() ; for ( int l = 0 ; l < size ; ++l ) { // if this SNP is hit. Then other allele don't have this snp // will deduct its likelihood //TODO: the deduction can be based on the quality score of the read int tag = 0 ; int snpId = positionToSnp[k][l] ; if ( snpHit[ snpId ] ) { tag = 0 ; } else { // if this SNP is not hit, then every allele containing this snp // will deduct its likelihood tag = 1 ; } for ( j = 0 ; j < numOfAllele ; ++j ) { if ( snpAllele[ snpId ][j] == tag ) { int v = -2 ; //if ( snpInfo[ snpId ].type == 'd' || snpInfo[ snpId ].type == 'i' ) // v = -4 * snpInfo[ snpId ].length ; if ( snpInfo[ snpId ].type == 'd' && snpInfo[ snpId ].position < k && k != coord.a ) { // The penality has already been subtracted. v = 0 ; } compatibility[i][j].value += v ; /*if ( i == 8 && j == 78 ) { printf( "Bad snp %d: %d %d\n", tag, k, positionToSnp[k][l] ) ; }*/ } } } } ++i ; } //printf( "%d %d\n", numOfAlignments, numOfAllele ) ; // Now, let's consider every pair of alleles, and compute its log likelihood double **logLikelihood ; logLikelihood = new double *[ numOfAllele] ; for ( j = 0 ; j < numOfAllele ; ++j ) { logLikelihood[j] = new double[ numOfAllele ] ; //memset( logLikelihood[j], 0, sizeof( double ) * numOfAllele ) ; for ( k = 0 ; k < numOfAllele ; ++k ) logLikelihood[j][k] = 0 ; } int prevBin = -1 ; double assignJBin = 0 ; double assignKBin = 0 ; for ( j = 0 ; j < numOfAllele ; ++j ) { for ( k = j ; k < numOfAllele ; ++k ) { double binAdjust = 0 ; double averageRead = ( (double)numOfAlignments ) / (double)( alleleLength[j] + alleleLength[k] ) * binSize ; for ( i = 0 ; i < numOfAlignments ; ++i ) { double vj = compatibility[i][j].value ; double vk = compatibility[i][k].value ; double weightJ = 0, weightK = 0 ; if ( vj == vk ) { weightJ = weightK = 0.5 ; } else if ( vj == vk + 2 ) { if ( vj == 0 ) { weightJ = 1 ; } else { weightJ = 0.99 ; weightK = 0.01 ; } } else if ( vk == vj + 2 ) { if ( vk == 0 ) { weightK = 1 ; } else { weightJ = 0.01 ; weightK = 0.99 ; } } else { if ( vk > vj ) weightK = 1 ; else weightJ = 1 ; } double l = weightJ * compatibility[i][j].value + weightK * compatibility[i][k].value ; if ( alignmentCoords[i].a / binSize != prevBin ) { if ( prevBin != -1 && ( assignJBin > averageRead + 4 * sqrt( averageRead ) || assignKBin > averageRead + 4 * sqrt( averageRead ) ) ) { //if ( j == 8 && k == 78 ) // printf( "%lf: %lf %lf %d %d\n", averageRead, assignJBin, assignKBin, alleleLength[j], alleleLength[k] ) ; binAdjust -= 4 ; } prevBin = alignmentCoords[i].a / binSize ; assignJBin = 0 ; assignKBin = 0 ; } assignJBin += weightJ ; assignKBin += weightK ; /*if ( j == 8 && k == 78 && l < 0 ) { printf( "Bad alignment %d (%s %s). %lf %lf: %lf\n", i, alleleIdToName[j].c_str(), alleleIdToName[k].c_str(), compatibility[i][j].value, compatibility[i][k].value, l ) ; }*/ logLikelihood[j][k] += l ; } logLikelihood[j][k] += ( -log( (double)alleleLength[j] ) / log(10.0 ) - log( (double)alleleLength[k] ) / log(10.0) ) ; logLikelihood[j][k] += binAdjust ; } } // Find the result double max ; int maxj = -1 ; int maxk = -1 ; std::vector< struct _result > results ; for ( j = 0 ; j < numOfAllele ; ++j ) { for ( k = j ; k < numOfAllele ; ++k ) { if ( maxj == -1 || logLikelihood[j][k] > max ) { maxj = j ; maxk = k ; max = logLikelihood[j][k] ; } struct _result r ; r.a = j ; r.b = k ; r.logLikelihood = logLikelihood[j][k] ; results.push_back( r ) ; } } //printf( "%s %s %lf\n", alleleIdToName[ maxj ].c_str(), alleleIdToName[ maxk ].c_str(), max) ; //printf( "%lf\n", logLikelihood[124][128] ) ; if ( results.size() == 0 ) { printf( "-1 -1 -1\n" ) ; exit( 1 ) ; } std::sort( results.begin(), results.end(), CompResult ) ; i = 0 ; printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), results[i].logLikelihood ) ; k = results.size() ; for ( i = 1 ; i < k ; ++i ) { if ( results[i].logLikelihood != results[0].logLikelihood ) break ; printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), results[i].logLikelihood ) ; } return 0 ; }