Commit b70bf938 authored by Max Kellermann's avatar Max Kellermann

util/UTF8: add SequenceLengthUTF8()

parent d5cf41e0
...@@ -166,6 +166,86 @@ ValidateUTF8(const char *p) ...@@ -166,6 +166,86 @@ ValidateUTF8(const char *p)
return true; return true;
} }
/**
 * Determine the total length of a UTF-8 sequence by looking only at
 * its first byte.  The continuation bytes are not inspected.
 *
 * @param ch the first byte of the sequence
 * @return the sequence length (start byte plus continuations), or 0
 * if #ch cannot start a sequence
 */
size_t
SequenceLengthUTF8(char ch)
{
	/* a plain ASCII character stands alone */
	if (IsASCII(ch))
		return 1;

	/* start byte announcing one continuation byte */
	if (IsLeading1(ch))
		return 2;

	/* start byte announcing two continuation bytes */
	if (IsLeading2(ch))
		return 3;

	/* start byte announcing three continuation bytes */
	if (IsLeading3(ch))
		return 4;

	/* start byte announcing four continuation bytes */
	if (IsLeading4(ch))
		return 5;

	/* start byte announcing five continuation bytes */
	if (IsLeading5(ch))
		return 6;

	/* a stray continuation byte or another illegal start byte */
	return 0;
}
/**
 * Functor which checks that the #L bytes starting at the given
 * pointer are all continuation bytes.  It handles the first byte
 * itself and delegates the remaining ones to the instantiation for
 * L-1; the first byte failing the test ends the check, so no byte
 * after it is read.
 */
template<size_t L>
struct CheckSequenceUTF8 {
	gcc_pure
	bool operator()(const char *p) const {
		if (!IsContinuation(*p))
			return false;

		return CheckSequenceUTF8<L-1>()(p + 1);
	}
};
/**
 * End of the recursion: zero remaining bytes are trivially accepted
 * and the pointer is never dereferenced.
 */
template<>
struct CheckSequenceUTF8<0u> {
	constexpr bool operator()(const char *) const {
		return true;
	}
};
/**
 * Check the #L bytes following a start byte.
 *
 * @param p pointer to the byte right after the start byte
 * @return the full sequence length (L continuation bytes plus the
 * start byte) if CheckSequenceUTF8<L> accepts the bytes at #p, or 0
 * otherwise
 */
template<size_t L>
gcc_pure
static size_t
InnerSequenceLengthUTF8(const char *p)
{
	if (!CheckSequenceUTF8<L>()(p))
		return 0u;

	return L + 1;
}
/**
 * Determine the length of the first UTF-8 sequence in the given
 * string, checking the continuation bytes announced by its start
 * byte.
 *
 * @param p pointer to the start byte of the sequence
 * @return the sequence length including the start byte, or 0 if the
 * start byte is illegal or one of the bytes after it is not a
 * continuation byte
 */
size_t
SequenceLengthUTF8(const char *p)
{
	const unsigned char lead = *p;
	const char *const tail = p + 1;

	/* a plain ASCII character stands alone */
	if (IsASCII(lead))
		return 1;

	/* one continuation byte must follow */
	if (IsLeading1(lead))
		return InnerSequenceLengthUTF8<1>(tail);

	/* two continuation bytes must follow */
	if (IsLeading2(lead))
		return InnerSequenceLengthUTF8<2>(tail);

	/* three continuation bytes must follow */
	if (IsLeading3(lead))
		return InnerSequenceLengthUTF8<3>(tail);

	/* four continuation bytes must follow */
	if (IsLeading4(lead))
		return InnerSequenceLengthUTF8<4>(tail);

	/* five continuation bytes must follow */
	if (IsLeading5(lead))
		return InnerSequenceLengthUTF8<5>(tail);

	/* a stray continuation byte or another illegal start byte */
	return 0;
}
static const char * static const char *
FindNonASCIIOrZero(const char *p) FindNonASCIIOrZero(const char *p)
{ {
......
...@@ -43,6 +43,22 @@ bool ...@@ -43,6 +43,22 @@ bool
ValidateUTF8(const char *p); ValidateUTF8(const char *p);
/** /**
* @return the length in bytes of the sequence beginning with the given
* character, or 0 if the character is not a valid start byte
*/
gcc_const
size_t
SequenceLengthUTF8(char ch);
/**
* @return the length in bytes of the first sequence in the given string, or 0
* if the sequence is malformed
*/
gcc_pure
size_t
SequenceLengthUTF8(const char *p);
/**
* Convert the specified string from ISO-8859-1 to UTF-8. * Convert the specified string from ISO-8859-1 to UTF-8.
* *
* @return the UTF-8 version of the source string; may return #src if * @return the UTF-8 version of the source string; may return #src if
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment