commit 554a9fe2e9d12defd9d6253871d8261d3f3ef3c6
parent 7b93d02cd8f26ab9a25d967c72c359a22c91eb74
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 9 Jan 2021 14:56:51 +0100
utf8pad: improve padded printing and printing invalid unicode characters
- Use unicode replacement character (codepoint 0xfffd) when a codepoint is
invalid and proceed printing the rest of the characters.
- When a codepoint is invalid reset the internal state of mbtowc(3), from the
OpenBSD man page:
" If a call to mbtowc() resulted in an undefined internal state, mbtowc()
must be called with s set to NULL to reset the internal state before it
can safely be used again."
- Make the function return 0 when `len` is 0 (this should not be an error).
Diffstat:
2 files changed, 83 insertions(+), 34 deletions(-)
diff --git a/stagit-gopher-index.c b/stagit-gopher-index.c
@@ -10,6 +10,9 @@
#include <git2.h>
+#define PAD_TRUNCATE_SYMBOL "\xe2\x80\xa6" /* symbol: "ellipsis" */
+#define UTF_INVALID_SYMBOL "\xef\xbf\xbd" /* symbol: "replacement" */
+
static git_repository *repo;
static const char *relpath = "";
@@ -17,40 +20,62 @@ static const char *relpath = "";
static char description[255] = "Repositories";
static char *name = "";
-/* format `len' columns of characters. If string is shorter pad the rest
+/* Format `len' columns of characters. If string is shorter pad the rest
* with characters `pad`. */
int
utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
{
wchar_t wc;
size_t col = 0, i, slen, siz = 0;
- int rl, w;
+ int inc, rl, w;
- if (!len)
+ if (!bufsiz)
return -1;
+ if (!len) {
+ buf[0] = '\0';
+ return 0;
+ }
slen = strlen(s);
- for (i = 0; i < slen; i += rl) {
- if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
- break;
- if ((w = wcwidth(wc)) == -1)
+ for (i = 0; i < slen; i += inc) {
+ inc = 1;
+ if ((unsigned char)s[i] < 32)
continue;
- if (col + w > len || (col + w == len && s[i + rl])) {
+
+ rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
+ if (rl < 0) {
+ mbtowc(NULL, NULL, 0); /* reset state */
+ inc = 1; /* next byte */
+ w = 1; /* replacement char is one width */
+ } else if ((w = wcwidth(wc)) == -1) {
+ continue;
+ } else {
+ inc = rl;
+ }
+
+ if (col + w > len || (col + w == len && s[i + inc])) {
if (siz + 4 >= bufsiz)
return -1;
- memcpy(&buf[siz], "\xe2\x80\xa6", 3);
- siz += 3;
- if (col + w == len && w > 1)
- buf[siz++] = pad;
+ memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1);
+ siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1;
buf[siz] = '\0';
- return 0;
+ col++;
+ break;
+ } else if (rl < 0) {
+ if (siz + 4 >= bufsiz)
+ return -1;
+ memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1);
+ siz += sizeof(UTF_INVALID_SYMBOL) - 1;
+ buf[siz] = '\0';
+ col++;
+ continue;
}
- if (siz + rl + 1 >= bufsiz)
+ if (siz + inc + 1 >= bufsiz)
return -1;
- memcpy(&buf[siz], &s[i], rl);
- col += w;
- siz += rl;
+ memcpy(&buf[siz], &s[i], inc);
+ siz += inc;
buf[siz] = '\0';
+ col += w;
}
len -= col;
diff --git a/stagit-gopher.c b/stagit-gopher.c
@@ -19,6 +19,8 @@
#include "compat.h"
#define LEN(s) (sizeof(s)/sizeof(*s))
+#define PAD_TRUNCATE_SYMBOL "\xe2\x80\xa6" /* symbol: "ellipsis" */
+#define UTF_INVALID_SYMBOL "\xef\xbf\xbd" /* symbol: "replacement" */
struct deltainfo {
git_patch *patch;
@@ -80,40 +82,62 @@ static char lastoidstr[GIT_OID_HEXSZ + 2]; /* id + newline + NUL byte */
static FILE *rcachefp, *wcachefp;
static const char *cachefile;
-/* format `len' columns of characters. If string is shorter pad the rest
+/* Format `len' columns of characters. If string is shorter pad the rest
* with characters `pad`. */
int
utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
{
wchar_t wc;
size_t col = 0, i, slen, siz = 0;
- int rl, w;
+ int inc, rl, w;
- if (!len)
+ if (!bufsiz)
return -1;
+ if (!len) {
+ buf[0] = '\0';
+ return 0;
+ }
slen = strlen(s);
- for (i = 0; i < slen; i += rl) {
- if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
- break;
- if ((w = wcwidth(wc)) == -1)
+ for (i = 0; i < slen; i += inc) {
+ inc = 1;
+ if ((unsigned char)s[i] < 32)
+ continue;
+
+ rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
+ if (rl < 0) {
+ mbtowc(NULL, NULL, 0); /* reset state */
+ inc = 1; /* next byte */
+ w = 1; /* replacement char is one width */
+ } else if ((w = wcwidth(wc)) == -1) {
continue;
- if (col + w > len || (col + w == len && s[i + rl])) {
+ } else {
+ inc = rl;
+ }
+
+ if (col + w > len || (col + w == len && s[i + inc])) {
if (siz + 4 >= bufsiz)
return -1;
- memcpy(&buf[siz], "\xe2\x80\xa6", 3);
- siz += 3;
- if (col + w == len && w > 1)
- buf[siz++] = pad;
+ memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1);
+ siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1;
buf[siz] = '\0';
- return 0;
+ col++;
+ break;
+ } else if (rl < 0) {
+ if (siz + 4 >= bufsiz)
+ return -1;
+ memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1);
+ siz += sizeof(UTF_INVALID_SYMBOL) - 1;
+ buf[siz] = '\0';
+ col++;
+ continue;
}
- if (siz + rl + 1 >= bufsiz)
+ if (siz + inc + 1 >= bufsiz)
return -1;
- memcpy(&buf[siz], &s[i], rl);
- col += w;
- siz += rl;
+ memcpy(&buf[siz], &s[i], inc);
+ siz += inc;
buf[siz] = '\0';
+ col += w;
}
len -= col;