/*
 * Convert the NUL-terminated string `in` from `in_encoding` to
 * `out_encoding` through iconv.
 *
 * Returns the result of reencode_string_iconv() for the whole input, or
 * NULL when `in_encoding` is NULL or when no conversion descriptor can be
 * opened for the requested pair of encodings.
 */
char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding)
{
	const iconv_t invalid = (iconv_t) -1;
	iconv_t cd;
	char *result;

	if (!in_encoding)
		return NULL;

	cd = iconv_open(out_encoding, in_encoding);
	if (cd == invalid) {
		/*
		 * Not every platform knows the many alternative spellings of
		 * UTF-8, so retry with the official one.  This is done only
		 * after the first attempt failed, so that a platform which
		 * understands the user's spelling but not the official one
		 * keeps working.
		 */
		const char *from = is_encoding_utf8(in_encoding) ? "UTF-8" : in_encoding;
		const char *to = is_encoding_utf8(out_encoding) ? "UTF-8" : out_encoding;

		cd = iconv_open(to, from);
		if (cd == invalid)
			return NULL;
	}

	result = reencode_string_iconv(in, strlen(in), cd);
	iconv_close(cd);
	return result;
}
static char *replace_encoding_header(char *buf, const char *encoding) { struct strbuf tmp = STRBUF_INIT; size_t start, len; char *cp = buf; /* guess if there is an encoding header before a \n\n */ while (strncmp(cp, "encoding ", strlen("encoding "))) { cp = strchr(cp, '\n'); if (!cp || *++cp == '\n') return buf; } start = cp - buf; cp = strchr(cp, '\n'); if (!cp) return buf; /* should not happen but be defensive */ len = cp + 1 - (buf + start); strbuf_attach(&tmp, buf, strlen(buf), strlen(buf) + 1); if (is_encoding_utf8(encoding)) { /* we have re-coded to UTF-8; drop the header */ strbuf_remove(&tmp, start, len); } else { /* just replaces XXXX in 'encoding XXXX\n' */ strbuf_splice(&tmp, start + strlen("encoding "), len - strlen("encoding \n"), encoding, strlen(encoding)); } return strbuf_detach(&tmp, NULL); }
/*
 * Return the length in bytes of the first character of the multi-byte
 * string `*text`, interpreted according to `encoding`.
 *
 * - `*text` is advanced past that character.
 * - If `remainder_p` is not NULL, `*remainder_p` holds on entry how many
 *   bytes may be consumed from `*text`, and on exit it has been reduced by
 *   the returned length.  If it is NULL, the available length is taken to be
 *   SIZE_MAX.
 * - 0 is returned, and nothing is modified, when no bytes are available.
 */
int mbs_chrlen(const char **text, size_t *remainder_p, const char *encoding)
{
	const char *start = *text;
	const char *cursor = start;
	size_t avail = SIZE_MAX;
	/*
	 * Default of one byte covers two cases:
	 *  - invalid UTF-8, which is treated as a raw byte sequence;
	 *  - TODO: encodings other than UTF-8, which are treated as one-byte
	 *    for now; iconv should be used to decode one character and obtain
	 *    its real length.
	 */
	int nbytes = 1;

	if (remainder_p)
		avail = *remainder_p;
	if (!avail)
		return 0;

	if (is_encoding_utf8(encoding)) {
		pick_one_utf8_char(&cursor, &avail);
		if (cursor)
			nbytes = cursor - start;
	}

	*text += nbytes;
	if (remainder_p)
		*remainder_p -= nbytes;
	return nbytes;
}
/*
 * Guess the charset of `line` when none is known.
 *
 * If the target charset is UTF-8 and the line already passes is_utf8()
 * (US-ASCII being a subset of UTF-8), NULL is returned to say that there is
 * nothing to convert.
 *
 * In every other case the text is assumed, for historical reasons, to be
 * Latin1 and "ISO8859-1" is returned.
 */
static const char *guess_charset(const struct strbuf *line, const char *target_charset)
{
	int already_utf8 = is_encoding_utf8(target_charset) && is_utf8(line->buf);

	return already_utf8 ? NULL : "ISO8859-1";
}
/*
 * Guess the charset of `line` when none is known.
 *
 * Text is assumed, for historical reasons, to be Latin1 ("latin1" is
 * returned) unless the target charset is UTF-8 and the line already passes
 * is_utf8() (US-ASCII being a subset of UTF-8); in that case NULL is
 * returned to say that there is nothing to convert.
 */
static const char *guess_charset(const char *line, const char *target_charset)
{
	if (!is_encoding_utf8(target_charset) || !is_utf8(line))
		return "latin1";
	return NULL;
}
/* Returns the length, in bytes, of the byte-order mark at the start of the N
   bytes in DATA, provided that ENCODING names an encoding of the matching
   family (UTF-8, UTF-16 or UTF-32).  Returns 0 if DATA does not start with
   a byte-order mark or if the mark does not match ENCODING.

   N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
   that. */
size_t
encoding_guess_bom_length (const char *encoding,
                           const void *data_, size_t n)
{
  const uint8_t *data = data_;

  if (is_utf8_bom (data, n) && is_encoding_utf8 (encoding))
    return 3;

  if (is_utf16le_bom (data, n) && is_encoding_utf16 (encoding))
    return 2;
  if (is_utf16be_bom (data, n) && is_encoding_utf16 (encoding))
    return 2;

  if (is_utf32le_bom (data, n) && is_encoding_utf32 (encoding))
    return 4;
  if (is_utf32be_bom (data, n) && is_encoding_utf32 (encoding))
    return 4;

  return 0;
}
/*
 * Write a commit object for `commit` and record its id in commit->sha1.
 *
 * The commit text is assembled with add_buffer(): a "tree" line taken from
 * commit->sha1, an optional "parent" line, "author" and "committer" lines
 * (both with the same identity, commit->date and a fixed "+0000" zone), an
 * "encoding" line when git_commit_encoding is not UTF-8, a blank line and
 * the log message from git_log().
 *
 * Returns 1 on success; 0 if the commit has no sha1, no log message could
 * be produced, the object could not be written, or the new id could not be
 * stored.
 */
static int git_commit(rev_commit *commit)
{
	unsigned char new_sha1[20];
	cvs_author *mapped;
	char *who;
	char *addr;
	char *message;
	size_t used = 0;
	int need_encoding_header;

	if (!commit->sha1)
		return 0;

	message = git_log(commit);
	if (!message)
		return 0;

	mapped = git_fullname(commit->author);
	if (mapped) {
		who = mapped->full;
		addr = mapped->email;
	} else {
		/*
		 * Not in the author map (silently; a diagnostic used to be
		 * printed here): use the raw author string for both fields.
		 */
		who = commit->author;
		addr = commit->author;
	}

	/* Not having i18n.commitencoding is the same as having utf-8 */
	need_encoding_header = !is_encoding_utf8(git_commit_encoding);

	add_buffer(&used, "tree %s\n", commit->sha1);
	if (commit->parent)
		add_buffer(&used, "parent %s\n", commit->parent->sha1);
	add_buffer(&used, "author %s <%s> %lu +0000\n", who, addr, commit->date);
	add_buffer(&used, "committer %s <%s> %lu +0000\n", who, addr, commit->date);
	if (need_encoding_header)
		add_buffer(&used, "encoding %s\n", git_commit_encoding);
	add_buffer(&used, "\n%s", message);

	if (write_sha1_file(commit_text, used, commit_type, new_sha1))
		return 0;

	commit->sha1 = atom(sha1_to_hex(new_sha1));
	return commit->sha1 ? 1 : 0;
}
/* Guesses the encoding of the N bytes of text at DATA, given the encoding
   specification ENCODING.

   DATA should start with the first non-ASCII text character (as determined
   by encoding_guess_is_ascii_text()) found in the input.

   Returns "UTF-8" if the data passes encoding_guess_tail_is_utf8().
   Otherwise returns the fallback encoding parsed from ENCODING, except that
   a UTF-8 fallback is replaced by "windows-1252", because the data has just
   been found not to be UTF-8.

   See encoding-guesser.h for intended use of this function.

   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes
   than that starting with the first non-ASCII text character. */
const char *
encoding_guess_tail_encoding (const char *encoding,
                              const void *data, size_t n)
{
  const char *fallback;

  if (encoding_guess_tail_is_utf8 (data, n))
    return "UTF-8";

  /* The data is not UTF-8, so never answer UTF-8 from here on. */
  fallback = encoding_guess_parse_encoding (encoding);
  return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
}
/* * Fill the given strbuf with the notes associated with the given object. * * If the given notes_tree structure is not initialized, it will be auto- * initialized to the default value (see documentation for init_notes() above). * If the given notes_tree is NULL, the internal/default notes_tree will be * used instead. * * (raw != 0) gives the %N userformat; otherwise, the note message is given * for human consumption. */ static void format_note(struct notes_tree *t, const struct object_id *object_oid, struct strbuf *sb, const char *output_encoding, int raw) { static const char utf8[] = "utf-8"; const struct object_id *oid; char *msg, *msg_p; unsigned long linelen, msglen; enum object_type type; if (!t) t = &default_notes_tree; if (!t->initialized) init_notes(t, NULL, NULL, 0); oid = get_note(t, object_oid); if (!oid) return; if (!(msg = read_object_file(oid, &type, &msglen)) || type != OBJ_BLOB) { free(msg); return; } if (output_encoding && *output_encoding && !is_encoding_utf8(output_encoding)) { char *reencoded = reencode_string(msg, output_encoding, utf8); if (reencoded) { free(msg); msg = reencoded; msglen = strlen(msg); } } /* we will end the annotation by a newline anyway */ if (msglen && msg[msglen - 1] == '\n') msglen--; if (!raw) { const char *ref = t->ref; if (!ref || !strcmp(ref, GIT_NOTES_DEFAULT_REF)) { strbuf_addstr(sb, "\nNotes:\n"); } else { if (starts_with(ref, "refs/")) ref += 5; if (starts_with(ref, "notes/")) ref += 6; strbuf_addf(sb, "\nNotes (%s):\n", ref); } } for (msg_p = msg; msg_p < msg + msglen; msg_p += linelen + 1) { linelen = strchrnul(msg_p, '\n') - msg_p; if (!raw) strbuf_addstr(sb, " "); strbuf_add(sb, msg_p, linelen); strbuf_addch(sb, '\n'); } free(msg); }
static void handle_commit(struct commit *commit, struct rev_info *rev) { int saved_output_format = rev->diffopt.output_format; const char *commit_buffer; const char *author, *author_end, *committer, *committer_end; const char *encoding, *message; char *reencoded = NULL; struct commit_list *p; const char *refname; int i; rev->diffopt.output_format = DIFF_FORMAT_CALLBACK; parse_commit_or_die(commit); commit_buffer = get_commit_buffer(commit, NULL); author = strstr(commit_buffer, "\nauthor "); if (!author) die ("Could not find author in commit %s", sha1_to_hex(commit->object.sha1)); author++; author_end = strchrnul(author, '\n'); committer = strstr(author_end, "\ncommitter "); if (!committer) die ("Could not find committer in commit %s", sha1_to_hex(commit->object.sha1)); committer++; committer_end = strchrnul(committer, '\n'); message = strstr(committer_end, "\n\n"); encoding = find_encoding(committer_end, message); if (message) message += 2; if (commit->parents && get_object_mark(&commit->parents->item->object) != 0 && !full_tree) { parse_commit_or_die(commit->parents->item); diff_tree_sha1(commit->parents->item->tree->object.sha1, commit->tree->object.sha1, "", &rev->diffopt); } else diff_root_tree_sha1(commit->tree->object.sha1, "", &rev->diffopt); /* Export the referenced blobs, and remember the marks. 
*/ for (i = 0; i < diff_queued_diff.nr; i++) if (!S_ISGITLINK(diff_queued_diff.queue[i]->two->mode)) export_blob(diff_queued_diff.queue[i]->two->sha1); refname = commit->util; if (anonymize) { refname = anonymize_refname(refname); anonymize_ident_line(&committer, &committer_end); anonymize_ident_line(&author, &author_end); } mark_next_object(&commit->object); if (anonymize) reencoded = anonymize_commit_message(message); else if (!is_encoding_utf8(encoding)) reencoded = reencode_string(message, "UTF-8", encoding); if (!commit->parents) printf("reset %s\n", refname); printf("commit %s\nmark :%"PRIu32"\n%.*s\n%.*s\ndata %u\n%s", refname, last_idnum, (int)(author_end - author), author, (int)(committer_end - committer), committer, (unsigned)(reencoded ? strlen(reencoded) : message ? strlen(message) : 0), reencoded ? reencoded : message ? message : ""); free(reencoded); unuse_commit_buffer(commit, commit_buffer); for (i = 0, p = commit->parents; p; p = p->next) { int mark = get_object_mark(&p->item->object); if (!mark) continue; if (i == 0) printf("from :%d\n", mark); else printf("merge :%d\n", mark); i++; } if (full_tree) printf("deleteall\n"); log_tree_diff_flush(rev); rev->diffopt.output_format = saved_output_format; printf("\n"); show_progress(); }
/*
 * Tell whether the encoding names `src` and `dst` denote the same encoding.
 *
 * Returns 1 when both names are UTF-8 spellings according to
 * is_encoding_utf8(), or when they are equal ignoring case; 0 otherwise.
 *
 * A NULL name is treated as "UTF-8" so that it never reaches strcasecmp(),
 * whose behaviour is undefined for null pointers.  NOTE(review): this
 * assumes an unset encoding means UTF-8 in this code base -- confirm
 * against is_encoding_utf8().
 */
int same_encoding(const char *src, const char *dst)
{
	static const char utf8[] = "UTF-8";

	if (!src)
		src = utf8;
	if (!dst)
		dst = utf8;

	if (is_encoding_utf8(src) && is_encoding_utf8(dst))
		return 1;
	return !strcasecmp(src, dst);
}
/* Attempts to guess the encoding of a text file based on ENCODING, an encoding name in one of the forms described at the top of encoding-guesser.h, and DATA, which contains the first N bytes of the file. Returns the guessed encoding, which might be ENCODING itself or a suffix of it or a statically allocated string. Encoding autodetection only takes place if ENCODING actually specifies autodetection. See encoding-guesser.h for details. UTF-8 cannot be distinguished from other ASCII-based encodings until a non-ASCII text character is encountered. If ENCODING specifies autodetection and this function returns "ASCII", then the client should process the input until it encounters an non-ASCII character (as returned by encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding() to make a final encoding guess. See encoding-guesser.h for details. N must be at least ENCODING_GUESS_MIN, unless the file is shorter than that. */ const char * encoding_guess_head_encoding (const char *encoding, const void *data_, size_t n) { const uint8_t *data = data_; const char *fallback_encoding; const char *guess; fallback_encoding = encoding_guess_parse_encoding (encoding); if (!encoding_guess_encoding_is_auto (encoding)) return fallback_encoding; if (n == 0) return fallback_encoding; if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n)) return "UTF-32"; if (n >= 4) { uint32_t x = get_be32 (data); if (x == 0x84319533) return "GB-18030"; else if (x == 0xdd736673) return "UTF-EBCDIC"; } if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n)) return "UTF-16"; if (is_utf8_bom (data, n)) return "UTF-8"; guess = guess_utf16 (data, n); if (guess != NULL) return guess; if (is_utf32 (data, n, get_be32)) return "UTF-32BE"; if (is_utf32 (data, n, get_le32)) return "UTF-32LE"; /* We've tried all the "giveaways" that make the encoding obvious. That rules out, incidentally, all the encodings with multibyte units (e.g. UTF-16, UTF-32). 
Our remaining goal is to try to distinguish UTF-8 from some ASCII-based fallback encoding. */ /* If the fallback encoding isn't ASCII compatible, give up. */ if (!is_encoding_ascii_compatible (fallback_encoding)) return fallback_encoding; /* If the data we have clearly is not UTF-8, give up. */ if (!encoding_guess_tail_is_utf8 (data, n)) { /* If the fallback encoding is UTF-8, fall back on something else.*/ if (is_encoding_utf8 (fallback_encoding)) return "windows-1252"; return fallback_encoding; } return "ASCII"; }
/*
 * Entry point of the commit command.
 *
 * Builds the text of a commit object in a strbuf (tree, parents, author,
 * committer, optional encoding header, blank line, message read from the
 * commit_editmsg file), writes it with write_sha1_file(), updates HEAD
 * through a ref lock with a reflog message, removes the MERGE_HEAD,
 * MERGE_MSG and SQUASH_MSG files, commits the index files and runs the
 * "post-commit" hook.
 *
 * Returns 0 on success and 1 when prepare_to_commit() reports failure.
 * Every other failure ends in die(), in most cases after
 * rollback_index_files().
 */
int cmd_commit(int argc, const char **argv, const char *prefix)
{
	int header_len;
	struct strbuf sb;
	const char *index_file, *reflog_msg;
	char *nl, *p;
	unsigned char commit_sha1[20];
	struct ref_lock *ref_lock;

	git_config(git_commit_config);

	argc = parse_and_validate_options(argc, argv, builtin_commit_usage);

	index_file = prepare_index(argc, argv, prefix);

	/* Set up everything for writing the commit object. This includes
	   running hooks, writing the trees, and interacting with the
	   user. */
	if (!prepare_to_commit(index_file, prefix)) {
		rollback_index_files();
		return 1;
	}

	/*
	 * The commit object
	 */
	strbuf_init(&sb, 0);
	strbuf_addf(&sb, "tree %s\n", sha1_to_hex(active_cache_tree->sha1));

	/*
	 * Determine parents.  Each branch also picks the reflog message
	 * prefix that is used when HEAD is updated further down.
	 */
	if (initial_commit) {
		/* no parent lines at all */
		reflog_msg = "commit (initial)";
	} else if (amend) {
		struct commit_list *c;
		struct commit *commit;

		/* reuse the parents of the commit HEAD points at */
		reflog_msg = "commit (amend)";
		commit = lookup_commit(head_sha1);
		if (!commit || parse_commit(commit))
			die("could not parse HEAD commit");

		for (c = commit->parents; c; c = c->next)
			add_parent(&sb, c->item->object.sha1);
	} else if (in_merge) {
		struct strbuf m;
		FILE *fp;

		/* HEAD first, then one parent per line of MERGE_HEAD */
		reflog_msg = "commit (merge)";
		add_parent(&sb, head_sha1);
		strbuf_init(&m, 0);
		fp = fopen(git_path("MERGE_HEAD"), "r");
		if (fp == NULL)
			die("could not open %s for reading: %s",
			    git_path("MERGE_HEAD"), strerror(errno));
		while (strbuf_getline(&m, fp, '\n') != EOF) {
			unsigned char sha1[20];
			if (get_sha1_hex(m.buf, sha1) < 0)
				die("Corrupt MERGE_HEAD file (%s)", m.buf);
			add_parent(&sb, sha1);
		}
		fclose(fp);
		strbuf_release(&m);
	} else {
		/* ordinary commit: HEAD is the only parent */
		reflog_msg = "commit";
		strbuf_addf(&sb, "parent %s\n", sha1_to_hex(head_sha1));
	}

	strbuf_addf(&sb, "author %s\n",
		    fmt_ident(author_name, author_email, author_date, IDENT_ERROR_ON_NO_NAME));
	strbuf_addf(&sb, "committer %s\n", git_committer_info(IDENT_ERROR_ON_NO_NAME));
	/* the encoding header is written only for non-UTF-8 encodings */
	if (!is_encoding_utf8(git_commit_encoding))
		strbuf_addf(&sb, "encoding %s\n", git_commit_encoding);
	/* blank line separating the headers from the message */
	strbuf_addch(&sb, '\n');

	/* Finally, get the commit message */
	/* header_len remembers where the message starts inside sb */
	header_len = sb.len;
	if (strbuf_read_file(&sb, git_path(commit_editmsg), 0) < 0) {
		rollback_index_files();
		die("could not read commit message");
	}

	/* Truncate the message just before the diff, if any. */
	p = strstr(sb.buf, "\ndiff --git a/");
	if (p != NULL)
		strbuf_setlen(&sb, p - sb.buf + 1);

	if (cleanup_mode != CLEANUP_NONE)
		stripspace(&sb, cleanup_mode == CLEANUP_ALL);
	/*
	 * NOTE(review): sb.len is compared against the int header_len;
	 * if sb.len is an unsigned type this is a mixed-sign comparison
	 * -- confirm against the strbuf definition.
	 */
	if (sb.len < header_len || message_is_empty(&sb, header_len)) {
		rollback_index_files();
		die("no commit message? aborting commit.");
	}
	/*
	 * An explicit NUL is appended here, which is why the object is
	 * written with a length of sb.len - 1 below.
	 */
	strbuf_addch(&sb, '\0');
	/*
	 * NOTE(review): commit_utf8_warn is passed as the format string
	 * itself; this is only safe as long as it contains no conversion
	 * specifiers -- confirm against its definition.
	 */
	if (is_encoding_utf8(git_commit_encoding) && !is_utf8(sb.buf))
		fprintf(stderr, commit_utf8_warn);

	if (write_sha1_file(sb.buf, sb.len - 1, commit_type, commit_sha1)) {
		rollback_index_files();
		die("failed to write commit object");
	}

	/*
	 * The lock is requested here but its result is only checked after
	 * the reflog message has been prepared.
	 */
	ref_lock = lock_any_ref_for_update("HEAD",
					   initial_commit ? NULL : head_sha1,
					   0);

	/*
	 * Reuse sb for the reflog message: keep only the first line of the
	 * commit message (adding a newline if it has none), drop the
	 * headers and prepend "<reflog_msg>: ".
	 */
	nl = strchr(sb.buf + header_len, '\n');
	if (nl)
		strbuf_setlen(&sb, nl + 1 - sb.buf);
	else
		strbuf_addch(&sb, '\n');
	strbuf_remove(&sb, 0, header_len);
	strbuf_insert(&sb, 0, reflog_msg, strlen(reflog_msg));
	strbuf_insert(&sb, strlen(reflog_msg), ": ", 2);

	if (!ref_lock) {
		rollback_index_files();
		die("cannot lock HEAD ref");
	}
	if (write_ref_sha1(ref_lock, commit_sha1, sb.buf) < 0) {
		rollback_index_files();
		die("cannot update HEAD ref");
	}

	/* HEAD now points at the new commit: remove merge/squash state files */
	unlink(git_path("MERGE_HEAD"));
	unlink(git_path("MERGE_MSG"));
	unlink(git_path("SQUASH_MSG"));

	/* no rollback here: the commit and the ref update already happened */
	if (commit_index_files())
		die ("Repository has been updated, but unable to write\n"
		     "new_index file. Check that disk is not full or quota is\n"
		     "not exceeded, and then \"git reset HEAD\" to recover.");

	rerere();
	run_hook(get_index_file(), "post-commit", NULL);
	if (!quiet)
		print_summary(prefix, commit_sha1);

	return 0;
}