void
FortranCUDASubroutinesGeneration::createSubroutines ()
{
  using std::string;
  using std::map;

  CUDAconstants->appendConstantInitialisationToModule ( moduleScope, declarations, /* isCuda = */ true );
  
  /*
   * ======================================================
   * This vector contains all subroutines called by user
   * kernels, to avoid their duplication in the output
   * file
   * ======================================================
   */
  vector < SgProcedureHeaderStatement * > allCalledRoutines;

  for (map <string, ParallelLoop *>::const_iterator it =
      declarations->firstParallelLoop (); it
      != declarations->lastParallelLoop (); ++it)
  {
    string const userSubroutineName = it->first;

    Debug::getInstance ()->debugMessage ("Analysing user subroutine '"
        + userSubroutineName + "' with subroutines already defined in previous kernels (number = "
        + boost::lexical_cast<string> (allCalledRoutines.size()) + ": ", Debug::FUNCTION_LEVEL, __FILE__, __LINE__);

    vector < SgProcedureHeaderStatement * > :: iterator finder;
    for ( finder = allCalledRoutines.begin (); finder != allCalledRoutines.end (); finder++ )
    {
      Debug::getInstance ()->debugMessage ("Routine: '" + (*finder)->get_name ().getString () + "'", Debug::FUNCTION_LEVEL, __FILE__, __LINE__);
    }
        
        
    FortranParallelLoop * parallelLoop =
        static_cast <FortranParallelLoop *> (it->second);

    FortranCUDAUserSubroutine * userDeviceSubroutine =
        new FortranCUDAUserSubroutine (moduleScope, parallelLoop, declarations);

    userDeviceSubroutine->createStatements ();        
        
    CUDAconstants->patchReferencesToConstants (
        userDeviceSubroutine->getSubroutineHeaderStatement ());

    /*
     * ======================================================
     * We have to set each node in the AST representation of
     * this subroutine as compiler generated, otherwise chunks
     * of the user kernel are missing in the output
     * ======================================================
     */

    RoseHelper::forceOutputOfCodeToFile (
        userDeviceSubroutine->getSubroutineHeaderStatement ());
        

    /*
     * ======================================================
     * When the user subroutine has calls to other user
     * subroutines we need to add them to the generated file
     * This call also eliminates automatically the duplicates
     * of routines already contained in allCalledRoutines
     * ======================================================
     */
    userDeviceSubroutine->appendAdditionalSubroutines (moduleScope, parallelLoop, declarations, CUDAconstants, &allCalledRoutines);
        
//    vector < FortranUserSubroutine * > additionalSubroutines = userDeviceSubroutine->getAdditionalSubroutines ();
    
    /*
     * ======================================================
     * Appending new routines to the list of all routines
     * appended
     * ======================================================
     */    
    //allCalledRoutines.insert(allCalledRoutines.end(), additionalSubroutines.begin(), additionalSubroutines.end());
    
    FortranCUDAKernelSubroutine * kernelSubroutine;

    if (parallelLoop->isDirectLoop ())
    {
      kernelSubroutine
          = new FortranCUDAKernelSubroutineDirectLoop (
              moduleScope,
              userDeviceSubroutine,
              parallelLoop,
              reductionSubroutines,
              cardinalitiesDeclarations[userSubroutineName],
              dimensionsDeclarations[userSubroutineName],
              static_cast <FortranCUDAModuleDeclarations *> (moduleDeclarations[userSubroutineName]));

      hostSubroutines[userSubroutineName]
          = new FortranCUDAHostSubroutineDirectLoop (
              moduleScope,
              kernelSubroutine,
              parallelLoop,
              cardinalitiesDeclarations[userSubroutineName],
              dimensionsDeclarations[userSubroutineName],
              static_cast <FortranCUDAModuleDeclarations *> (moduleDeclarations[userSubroutineName]));
    }
    else
    {
      kernelSubroutine
          = new FortranCUDAKernelSubroutineIndirectLoop (
              moduleScope,
              userDeviceSubroutine,
              parallelLoop,
              reductionSubroutines,
              static_cast <FortranCUDAOpDatCardinalitiesDeclarationIndirectLoop *> (cardinalitiesDeclarations[userSubroutineName]),
              dimensionsDeclarations[userSubroutineName],
              static_cast <FortranCUDAModuleDeclarations *> (moduleDeclarations[userSubroutineName]));

      hostSubroutines[userSubroutineName]
          = new FortranCUDAHostSubroutineIndirectLoop (
              moduleScope,
              kernelSubroutine,
              parallelLoop,
              static_cast <FortranCUDAOpDatCardinalitiesDeclarationIndirectLoop *> (cardinalitiesDeclarations[userSubroutineName]),
              dimensionsDeclarations[userSubroutineName],
              static_cast <FortranCUDAModuleDeclarations *> (moduleDeclarations[userSubroutineName]));
    }
  }
}