util/UTF8: new library

d5cf41e0 · Max Kellermann · b7a1954c · d5cf41e0 · d5cf41e0 · d5cf41e0
Commit d5cf41e0 authored Oct 10, 2014 by Max Kellermann
Hide whitespace changes
Inline Side-by-side

Showing with 340 additions and 0 deletions

Makefile.am Makefile.am +1 -0

UTF8.cxx src/util/UTF8.cxx +265 -0

UTF8.hxx src/util/UTF8.hxx +74 -0

No files found.
--- a/Makefile.am
+++ b/Makefile.am
@@ -366,6 +366,7 @@ libutil_a_SOURCES = \
 	src/util/Domain.hxx \
 	src/util/ReusableArray.hxx \
 	src/util/ASCII.hxx \
+	src/util/UTF8.cxx src/util/UTF8.hxx \
 	src/util/CharUtil.hxx \
 	src/util/NumberParser.hxx \
 	src/util/StringUtil.cxx src/util/StringUtil.hxx \

--- a/src/util/UTF8.cxx
+++ b/src/util/UTF8.cxx
+/*
+ * Copyright (C) 2011-2014 Max Kellermann <max@duempel.org>
+ * http://www.musicpd.org
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "UTF8.hxx"
+#include "CharUtil.hxx"
+#include <algorithm>
+/**
+ * Does this byte start a 2-byte sequence, i.e. is it a leading byte
+ * that is followed by exactly 1 continuation byte?
+ */
+static constexpr bool
+IsLeading1(unsigned char ch)
+{
+	/* bit pattern 110xxxxx */
+	return (ch >> 5) == 0x06;
+}
+/**
+ * Build the leading byte of a 2-byte sequence by combining the
+ * 110xxxxx marker with the given payload bits.  The value is not
+ * masked; the caller passes only bits that fit below the marker.
+ */
+static constexpr unsigned char
+MakeLeading1(unsigned char value)
+{
+	return value | 0xc0;
+}
+/**
+ * Does this byte start a 3-byte sequence, i.e. is it a leading byte
+ * that is followed by exactly 2 continuation bytes?
+ */
+static constexpr bool
+IsLeading2(unsigned char ch)
+{
+	/* bit pattern 1110xxxx */
+	return (ch >> 4) == 0x0e;
+}
+/**
+ * Build the leading byte of a 3-byte sequence by combining the
+ * 1110xxxx marker with the given payload bits.  The value is not
+ * masked; the caller passes only bits that fit below the marker.
+ */
+static constexpr unsigned char
+MakeLeading2(unsigned char value)
+{
+	return value | 0xe0;
+}
+/**
+ * Does this byte start a 4-byte sequence, i.e. is it a leading byte
+ * that is followed by exactly 3 continuation bytes?
+ */
+static constexpr bool
+IsLeading3(unsigned char ch)
+{
+	/* bit pattern 11110xxx */
+	return (ch >> 3) == 0x1e;
+}
+/**
+ * Build the leading byte of a 4-byte sequence by combining the
+ * 11110xxx marker with the given payload bits.  The value is not
+ * masked; the caller passes only bits that fit below the marker.
+ */
+static constexpr unsigned char
+MakeLeading3(unsigned char value)
+{
+	return value | 0xf0;
+}
+/**
+ * Does this byte start a 5-byte sequence, i.e. is it a leading byte
+ * that is followed by exactly 4 continuation bytes?
+ */
+static constexpr bool
+IsLeading4(unsigned char ch)
+{
+	/* bit pattern 111110xx */
+	return (ch >> 2) == 0x3e;
+}
+/**
+ * Build the leading byte of a 5-byte sequence by combining the
+ * 111110xx marker with the given payload bits.  The value is not
+ * masked; the caller passes only bits that fit below the marker.
+ */
+static constexpr unsigned char
+MakeLeading4(unsigned char value)
+{
+	return value | 0xf8;
+}
+/**
+ * Does this byte start a 6-byte sequence, i.e. is it a leading byte
+ * that is followed by exactly 5 continuation bytes?
+ */
+static constexpr bool
+IsLeading5(unsigned char ch)
+{
+	/* bit pattern 1111110x */
+	return (ch >> 1) == 0x7e;
+}
+/**
+ * Build the leading byte of a 6-byte sequence by combining the
+ * 1111110x marker with the given payload bit.  The value is not
+ * masked; the caller passes only bits that fit below the marker.
+ */
+static constexpr unsigned char
+MakeLeading5(unsigned char value)
+{
+	return value | 0xfc;
+}
+/**
+ * Is this a continuation byte, i.e. a byte that may only appear
+ * after a leading byte?
+ */
+static constexpr bool
+IsContinuation(unsigned char ch)
+{
+	/* bit pattern 10xxxxxx */
+	return (ch >> 6) == 0x02;
+}
+/**
+ * Generate a continuation byte from the low 6 bits of the given
+ * value; all higher bits are discarded.
+ */
+static constexpr unsigned char
+MakeContinuation(unsigned char value)
+{
+	return (value & 0x3f) | 0x80;
+}
+/**
+ * Check whether the null-terminated string consists only of ASCII
+ * bytes and of leading bytes that are each followed by the number
+ * of continuation bytes they announce (1 to 5).  Only this byte
+ * structure is verified; the decoded values are not inspected.
+ */
+bool
+ValidateUTF8(const char *p)
+{
+	while (*p != 0) {
+		const unsigned char lead = *p;
+		if (!IsASCII(lead)) {
+			/* how many continuation bytes does this leading
+			   byte announce?  0 means "not a leading byte",
+			   which includes a stray continuation byte */
+			const unsigned n_continuations =
+				IsLeading1(lead) ? 1 :
+				IsLeading2(lead) ? 2 :
+				IsLeading3(lead) ? 3 :
+				IsLeading4(lead) ? 4 :
+				IsLeading5(lead) ? 5 : 0;
+			if (n_continuations == 0)
+				return false;
+			/* a premature null terminator is not a
+			   continuation byte, so this never reads past
+			   the end of the string */
+			for (unsigned i = 0; i < n_continuations; ++i)
+				if (!IsContinuation(*++p))
+					return false;
+		}
+		++p;
+	}
+	return true;
+}
+/**
+ * Skip over leading ASCII bytes.
+ *
+ * @return a pointer to the first non-ASCII byte, or to the null
+ * terminator if the string is plain ASCII
+ */
+static const char *
+FindNonASCIIOrZero(const char *p)
+{
+	for (; *p != 0; ++p)
+		if (!IsASCII(*p))
+			break;
+	return p;
+}
+/**
+ * Convert a null-terminated ISO-8859-1 string to UTF-8.
+ *
+ * @param src the source string
+ * @param buffer the destination buffer, used only if #src contains
+ * non-ASCII bytes
+ * @param buffer_size the size of #buffer in bytes, including room
+ * for the null terminator
+ * @return #src if it is plain ASCII, #buffer after a successful
+ * conversion, or nullptr if #buffer is too small (its contents are
+ * then unspecified)
+ */
+const char *
+Latin1ToUTF8(const char *gcc_restrict src, char *gcc_restrict buffer,
+	     size_t buffer_size)
+{
+	const char *p = FindNonASCIIOrZero(src);
+	if (*p == 0)
+		/* everything is plain ASCII, we don't need to convert anything */
+		return src;
+	if ((size_t)(p - src) >= buffer_size)
+		/* buffer too small */
+		return nullptr;
+	const char *const end = buffer + buffer_size;
+	/* the ASCII prefix can be copied verbatim */
+	char *q = std::copy(src, p, buffer);
+	/* invariant at the top of each iteration: q < end, so there is
+	   always room for at least one more byte */
+	while (*p != 0) {
+		unsigned char ch = *p++;
+		if (IsASCII(ch)) {
+			*q++ = ch;
+			if (q >= end)
+				/* buffer too small */
+				return nullptr;
+		} else {
+			/* need two bytes plus room for what follows;
+			   compare distances instead of computing
+			   "q + 2", which could point more than one
+			   element past the end of the buffer */
+			if (end - q <= 2)
+				/* buffer too small */
+				return nullptr;
+			*q++ = MakeLeading1(ch >> 6);
+			*q++ = MakeContinuation(ch);
+		}
+	}
+	*q = 0;
+	return buffer;
+}
+/**
+ * Encode one Unicode character as UTF-8 and store the bytes at #q.
+ * Between 1 and 6 bytes are written, depending on the magnitude of
+ * #ch; no null terminator is appended.
+ *
+ * @return a pointer just past the last byte written; #q itself if
+ * #ch is 0x80000000 or larger (nothing is written in that case)
+ */
+char *
+UnicodeToUTF8(unsigned ch, char *q)
+{
+	if (gcc_likely(ch < 0x80)) {
+		q[0] = (char)ch;
+		return q + 1;
+	}
+	if (gcc_likely(ch < 0x800)) {
+		q[0] = MakeLeading1(ch >> 6);
+		q[1] = MakeContinuation(ch);
+		return q + 2;
+	}
+	if (ch < 0x10000) {
+		q[0] = MakeLeading2(ch >> 12);
+		q[1] = MakeContinuation(ch >> 6);
+		q[2] = MakeContinuation(ch);
+		return q + 3;
+	}
+	if (ch < 0x200000) {
+		q[0] = MakeLeading3(ch >> 18);
+		q[1] = MakeContinuation(ch >> 12);
+		q[2] = MakeContinuation(ch >> 6);
+		q[3] = MakeContinuation(ch);
+		return q + 4;
+	}
+	if (ch < 0x4000000) {
+		q[0] = MakeLeading4(ch >> 24);
+		q[1] = MakeContinuation(ch >> 18);
+		q[2] = MakeContinuation(ch >> 12);
+		q[3] = MakeContinuation(ch >> 6);
+		q[4] = MakeContinuation(ch);
+		return q + 5;
+	}
+	if (ch < 0x80000000) {
+		q[0] = MakeLeading5(ch >> 30);
+		q[1] = MakeContinuation(ch >> 24);
+		q[2] = MakeContinuation(ch >> 18);
+		q[3] = MakeContinuation(ch >> 12);
+		q[4] = MakeContinuation(ch >> 6);
+		q[5] = MakeContinuation(ch);
+		return q + 6;
+	}
+	/* error: the value cannot be encoded; leave the buffer alone */
+	return q;
+}
+/**
+ * Count the characters in a null-terminated UTF-8 string.
+ */
+size_t
+LengthUTF8(const char *p)
+{
+	/* deliberately naive: no validation is performed; every byte
+	   that is not a UTF-8 continuation byte is taken to start a
+	   new character */
+	size_t count = 0;
+	while (*p != 0) {
+		if (!IsContinuation(*p))
+			++count;
+		++p;
+	}
+	return count;
+}
--- a/src/util/UTF8.hxx
+++ b/src/util/UTF8.hxx
+/*
+ * Copyright (C) 2011-2014 Max Kellermann <max@duempel.org>
+ * http://www.musicpd.org
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef UTF8_HXX
+#define UTF8_HXX
+#include "Compiler.h"
+#include <stddef.h>
+/**
+ * Is this a valid UTF-8 string?
+ *
+ * Only the byte structure is checked: every non-ASCII leading byte
+ * must be followed by the number of continuation bytes it announces
+ * (sequences of up to 6 bytes are accepted).  The decoded values are
+ * not inspected.
+ *
+ * @param p a null-terminated string
+ * @return true if the string is structurally well-formed
+ */
+gcc_pure gcc_nonnull_all
+bool
+ValidateUTF8(const char *p);
+/**
+ * Convert the specified string from ISO-8859-1 to UTF-8.
+ *
+ * This function is deliberately not declared "pure": it writes the
+ * converted string into #buffer.
+ *
+ * @param src the null-terminated source string
+ * @param buffer the destination buffer
+ * @param buffer_size the size of #buffer in bytes, including room
+ * for the null terminator
+ * @return the UTF-8 version of the source string; may return #src if
+ * there are no non-ASCII characters; returns nullptr if the destination
+ * buffer is too small
+ */
+gcc_nonnull_all
+const char *
+Latin1ToUTF8(const char *src, char *buffer, size_t buffer_size);
+/**
+ * Convert the specified Unicode character to UTF-8 and write it to
+ * the buffer.  buffer must have a length of at least 6!  No null
+ * terminator is appended.
+ *
+ * @param ch the character to encode; values of 0x80000000 and above
+ * cannot be encoded, and nothing is written for them
+ * @param buffer the destination buffer
+ * @return a pointer to the buffer plus the added byte(s)
+ */
+gcc_nonnull_all
+char *
+UnicodeToUTF8(unsigned ch, char *buffer);
+/**
+ * Returns the number of characters in the string.  This is different
+ * from strlen(), which counts the number of bytes.
+ *
+ * The string is not validated: every byte that is not a UTF-8
+ * continuation byte is counted as one character.
+ *
+ * @param p a null-terminated string
+ */
+gcc_pure gcc_nonnull_all
+size_t
+LengthUTF8(const char *p);
+#endif