From d35944cc7d435e04df0203421eb00768345c33f6 Mon Sep 17 00:00:00 2001
From: Bryan O'Sullivan
Date: Tue, 13 Mar 2012 16:21:42 -0700
Subject: [PATCH] Decode messages leniently if encoding is not known

This avoids raising a UnicodeDecodeError on some really old
commits.

We also add undecoded attribute accessors, to get at the raw
bytes.  The raw tag accessor returns None when the tag has no
message.
---
 pygit2.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/pygit2.c b/pygit2.c
index c4744b4..cc7e9d2 100644
--- a/pygit2.c
+++ b/pygit2.c
@@ -49,8 +49,14 @@
 Py_LOCAL_INLINE(PyObject*)
 to_unicode(const char *value, const char *encoding, const char *errors)
 {
-    if (encoding == NULL)
+    if (encoding == NULL) {
+        /* If the encoding is not explicit, it may not be UTF-8, so it
+         * is not safe to decode it strictly.  This is rare in the
+         * wild, but does occur in old commits to git itself
+         * (e.g. c31820c2). */
         encoding = "utf-8";
+        errors = "replace";
+    }
     return PyUnicode_Decode(value, strlen(value), encoding, errors);
 }
 
@@ -1223,6 +1229,14 @@ Commit_get_message(Commit *commit)
     return to_unicode(message, encoding, "strict");
 }
 
+/* Return the commit message as a byte string, without any decoding,
+ * so callers can get at the raw bytes. */
+static PyObject *
+Commit_get_raw_message(Commit *commit)
+{
+    return PyString_FromString(git_commit_message(commit->commit));
+}
+
 static PyObject *
 Commit_get_commit_time(Commit *commit)
 {
@@ -1318,6 +1332,7 @@ static PyGetSetDef Commit_getseters[] = {
     {"message_encoding", (getter)Commit_get_message_encoding, NULL,
      "message encoding", NULL},
     {"message", (getter)Commit_get_message, NULL, "message", NULL},
+    {"_message", (getter)Commit_get_raw_message, NULL, "message (bytes)", NULL},
     {"commit_time", (getter)Commit_get_commit_time, NULL, "commit time",
      NULL},
     {"commit_time_offset", (getter)Commit_get_commit_time_offset, NULL,
@@ -1897,12 +1912,26 @@ Tag_get_message(Tag *self)
     return to_unicode(message, "utf-8", "strict");
 }
 
+/* Return the tag message as a byte string, without any decoding, or
+ * None when the tag carries no message: PyString_FromString() must
+ * not be handed a NULL pointer. */
+static PyObject *
+Tag_get_raw_message(Tag *self)
+{
+    const char *message;
+
+    message = git_tag_message(self->tag);
+    if (message == NULL)
+        Py_RETURN_NONE;
+    return PyString_FromString(message);
+}
+
 static PyGetSetDef Tag_getseters[] = {
     {"target", (getter)Tag_get_target, NULL, "tagged object", NULL},
     {"name", (getter)Tag_get_name, NULL, "tag name", NULL},
     {"tagger", (getter)Tag_get_tagger, NULL, "tagger", NULL},
-    {"message", (getter)Tag_get_message, NULL, "tag message",
-     NULL},
+    {"message", (getter)Tag_get_message, NULL, "tag message", NULL},
+    {"_message", (getter)Tag_get_raw_message, NULL, "tag message (bytes)", NULL},
     {NULL}
 };
 