/*
 * Detects the language of a source file.
 *
 * Detection is attempted in this order, stopping at the first hit:
 *   1. the file extension (as given, then lower-cased); an ambiguous result
 *      is handed to its disambiguation function and a binary one yields NULL;
 *   2. the exact filename;
 *   3. an Emacs mode line on the first line that is not a "#!" line;
 *   4. the output of the Unix 'file' command, run on the file itself or on a
 *      temporary copy of its contents when the path does not exist on disk.
 *
 * Returns the detected language name, or NULL when the file is binary,
 * nothing matched, or the 'file' command could not be run.
 */
const char *ohcount_detect_language(SourceFile *sourcefile) {
  const char *language = NULL;
  char *p, *pe;
  int length;

  // Attempt to detect based on file extension.
  length = strlen(sourcefile->ext);
  struct ExtensionMap *re =
      ohcount_hash_language_from_ext(sourcefile->ext, length);
  if (re) language = re->value;
  if (language == NULL) {
    // Try the lower-case version of this extension.
    char lowerext[length + 1];
    strncpy(lowerext, sourcefile->ext, length);
    lowerext[length] = '\0';
    for (p = lowerext; p < lowerext + length; p++)
      *p = tolower((unsigned char)*p);
    re = ohcount_hash_language_from_ext(lowerext, length);
    // Do not return here: the result must still go through the
    // ambiguity/binary handling below, like an exact-case match does.
    if (re) language = re->value;
  }
  if (language) {
    if (ISAMBIGUOUS(language)) {
      // Call the appropriate function for disambiguation.
      length = strlen(DISAMBIGUATEWHAT(language));
      struct DisambiguateFuncsMap *rd =
          ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
                                                 length);
      if (rd) return rd->value(sourcefile);
    } else {
      return ISBINARY(language) ? NULL : language;
    }
  }

  // Attempt to detect based on filename.
  length = strlen(sourcefile->filename);
  struct FilenameMap *rf =
      ohcount_hash_language_from_filename(sourcefile->filename, length);
  if (rf) return rf->value;

  char line[81] = { '\0' }, buf[81];

  // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
  p = ohcount_sourcefile_get_contents(sourcefile);
  pe = p;
  char *eof = p ? p + ohcount_sourcefile_get_contents_size(sourcefile) : p;
  while (pe < eof) {
    // Get the contents of the first line.
    while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
    // Keep at most sizeof(line) - 1 characters so the terminator fits.
    size_t span = (size_t)(pe - p);
    length = span < sizeof(line) ? (int)span : (int)sizeof(line) - 1;
    strncpy(line, p, length);
    line[length] = '\0';
    if (line[0] == '#' && line[1] == '!') {
      // First line was sh-bang; loop to get contents of second line.
      while (pe < eof && (*pe == '\r' || *pe == '\n')) pe++;
      p = pe;
    } else {
      break;
    }
  }
  // Lower-case the line so the marker and language name match in any case.
  for (p = line; *p; p++) *p = tolower((unsigned char)*p);
  p = strstr(line, "-*-");
  if (p) {
    p += 3;
    while (*p == ' ' || *p == '\t') p++;
    if (strncmp(p, "mode", 4) == 0) {
      p += 4;
      while (*p == ' ' || *p == '\t' || *p == ':') p++;
    }
    pe = p;
    while (isalnum((unsigned char)*pe)) pe++;
    length = pe - p; // bounded by strlen(line) < sizeof(buf)
    strncpy(buf, p, length);
    buf[length] = '\0';
    struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
    if (rl) return rl->name;
  }

  // Attempt to detect based on Unix 'file' command.
  int tmpfile = 0;
  char *path = sourcefile->filepath;
  if (sourcefile->diskpath) path = sourcefile->diskpath;
  if (access(path, F_OK) != 0) {
    // The file is not on disk: write its contents to a temporary file.
    static const char tmpl[] = "/tmp/ohcount_XXXXXXX";
    path = malloc(sizeof(tmpl));
    if (!path) return NULL;
    memcpy(path, tmpl, sizeof(tmpl)); // includes the terminating NUL
    int fd = mkstemp(path);
    if (fd < 0) {
      free(path);
      return NULL;
    }
    char *contents = ohcount_sourcefile_get_contents(sourcefile);
    log_it("contents:");
    log_it(contents);
    length = contents ? strlen(contents) : 0;
    // Best effort: 'file' inspects whatever part of the contents was written.
    if (length > 0) write(fd, contents, length);
    close(fd);
    tmpfile = 1;
  }

  // Build "file -b '<path>'". Each single quote in the path is rewritten as
  // '\'' so the path cannot break out of the quoted shell argument.
  static const char prefix[] = "file -b '";
  size_t quotes = 0;
  for (p = path; *p; p++)
    if (*p == '\'') quotes++;
  // prefix + path + 3 extra bytes per quote + closing quote + NUL.
  char command[strlen(path) + 3 * quotes + sizeof(prefix) + 1];
  memcpy(command, prefix, sizeof(prefix) - 1);
  pe = command + sizeof(prefix) - 1;
  for (p = path; *p; p++) {
    if (*p == '\'') {
      memcpy(pe, "'\\''", 4);
      pe += 4;
    } else {
      *pe++ = *p;
    }
  }
  *pe++ = '\'';
  *pe = '\0';

  FILE *f = popen(command, "r");
  if (f) {
    // On read failure, clear the buffer so the stale mode line left in it is
    // not mistaken for output of 'file'.
    if (!fgets(line, sizeof(line), f)) line[0] = '\0';
    for (p = line; *p; p++) *p = tolower((unsigned char)*p);
    p = strstr(line, "script text");
    if (p && p == line) { // /^script text(?: executable)? for \w/
      p = strstr(line, "for ");
      if (p) {
        p += 4;
        pe = p;
        while (isalnum((unsigned char)*pe)) pe++;
        length = pe - p;
        strncpy(buf, p, length);
        buf[length] = '\0';
        struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
        if (rl) language = rl->name;
      }
    } else if (p) { // /(\w+)(?: -\w+)* script text/
      // Walk backwards from "script text" to the word naming the interpreter.
      // Every step is bounded by the start of the buffer.
      do {
        p--;
        pe = p;
        while (p > line && *p == ' ') p--;
        while (p != line && isalnum((unsigned char)*(p - 1))) p--;
        if (p != line && *(p - 1) == '-') p--;
      } while (p > line && *p == '-'); // Skip over any switches.
      length = pe - p;
      strncpy(buf, p, length);
      buf[length] = '\0';
      struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
      if (rl) language = rl->name;
    } else if (strstr(line, "xml")) {
      language = LANG_XML;
    }
    pclose(f);
  }

  // Remove the temporary copy whether or not 'file' could be started.
  if (tmpfile) {
    remove(path);
    free(path);
  }
  return f ? language : NULL;
}
/*
 * Use all available means to detect file language.
 *
 * Detection is attempted in this order, each step running only when the
 * previous ones found nothing:
 *   1. an Emacs mode line on the first line that is not a "#!" line; the
 *      mode name is looked up as a language name and as an extension, first
 *      as written and then lower-cased ("c" in any case always means C);
 *   2. the file extension (as given, then lower-cased);
 *   3. the exact filename;
 *   4. detect_language_magic().
 *
 * An ambiguous result is handed to its disambiguation function; a binary
 * one yields NULL. Returns the language name, or NULL when none was found.
 */
const char *ohcount_detect_language(SourceFile *sourcefile) {
  const char *language = NULL;
  char *p, *pe;
  int length;

  // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
  char line[81] = { '\0' }, buf[81];
  p = ohcount_sourcefile_get_contents(sourcefile);
  pe = p;
  char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
  while (pe < eof) {
    // Get the contents of the first line.
    while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
    // Keep at most sizeof(line) - 1 characters so the terminator fits.
    size_t span = (size_t)(pe - p);
    length = span < sizeof(line) ? (int)span : (int)sizeof(line) - 1;
    strncpy(line, p, length);
    line[length] = '\0';
    if (line[0] == '#' && line[1] == '!') {
      // First line was sh-bang; loop to get contents of second line.
      while (pe < eof && (*pe == '\r' || *pe == '\n')) pe++;
      p = pe;
    } else {
      break;
    }
  }
  p = strstr(line, "-*-");
  if (p) {
    p += 3;
    while (*p == ' ' || *p == '\t') p++;
    // detect "mode" (any capitalization)
    if (strncasecmp(p, "mode", 4) == 0) {
      p += 4;
      while (*p == ' ' || *p == '\t' || *p == ':') p++;
    }
    // The mode name ends at whitespace, ';', a closing "-*-", or the end of
    // the line. The end-of-line test keeps the scan inside the buffer.
    pe = p;
    while (*pe != '\0' && !isspace((unsigned char)*pe) && *pe != ';' &&
           strncmp(pe, "-*-", 3) != 0)
      pe++;
    size_t name_len = (size_t)(pe - p);
    length = name_len < sizeof(buf) ? (int)name_len : (int)sizeof(buf) - 1;
    strncpy(buf, p, length);
    buf[length] = '\0';

    // Special case for "c" or "C" emacs mode header: always means C, not C++
    if (strcasecmp(buf, "c") == 0) {
      return LANG_C;
    }

    // First try it with the language name.
    struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
    if (rl) language = rl->name;
    if (!language) {
      // Then try it with the extension table.
      struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
      if (re) language = re->value;
    }
    if (!language) {
      // Try the lower-case version of this modeline.
      for (pe = buf; pe < buf + length; pe++)
        *pe = tolower((unsigned char)*pe);
      // First try it with the language name.
      rl = ohcount_hash_language_from_name(buf, length);
      if (rl) language = rl->name;
    }
    if (!language) {
      // Then try it with the extension table.
      struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
      if (re) language = re->value;
    }
  }

  // Attempt to detect based on file extension.
  if (!language) {
    length = strlen(sourcefile->ext);
    struct ExtensionMap *re =
        ohcount_hash_language_from_ext(sourcefile->ext, length);
    if (re) language = re->value;
    if (!language) {
      // Try the lower-case version of this extension.
      char lowerext[length + 1];
      strncpy(lowerext, sourcefile->ext, length);
      lowerext[length] = '\0';
      for (p = lowerext; p < lowerext + length; p++)
        *p = tolower((unsigned char)*p);
      re = ohcount_hash_language_from_ext(lowerext, length);
      if (re) language = re->value;
    }
  }

  // Attempt to detect based on filename.
  if (!language) {
    length = strlen(sourcefile->filename);
    struct FilenameMap *rf =
        ohcount_hash_language_from_filename(sourcefile->filename, length);
    if (rf) language = rf->value;
  }

  // Attempt to detect based on Unix 'file' command.
  if (!language) {
    language = detect_language_magic(sourcefile);
  }

  if (language) {
    if (ISAMBIGUOUS(language)) {
      // Call the appropriate function for disambiguation.
      length = strlen(DISAMBIGUATEWHAT(language));
      struct DisambiguateFuncsMap *rd =
          ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
                                                 length);
      if (rd) language = rd->value(sourcefile);
    } else {
      language = ISBINARY(language) ? NULL : language;
    }
  }
  return language;
}