Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
mpd
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Иван Мажукин
mpd
Commits
d5cf41e0
Commit
d5cf41e0
authored
Oct 10, 2014
by
Max Kellermann
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
util/UTF8: new library
parent
b7a1954c
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
340 additions
and
0 deletions
+340
-0
Makefile.am
Makefile.am
+1
-0
UTF8.cxx
src/util/UTF8.cxx
+265
-0
UTF8.hxx
src/util/UTF8.hxx
+74
-0
No files found.
Makefile.am
View file @
d5cf41e0
...
@@ -366,6 +366,7 @@ libutil_a_SOURCES = \
...
@@ -366,6 +366,7 @@ libutil_a_SOURCES = \
src/util/Domain.hxx
\
src/util/Domain.hxx
\
src/util/ReusableArray.hxx
\
src/util/ReusableArray.hxx
\
src/util/ASCII.hxx
\
src/util/ASCII.hxx
\
src/util/UTF8.cxx src/util/UTF8.hxx
\
src/util/CharUtil.hxx
\
src/util/CharUtil.hxx
\
src/util/NumberParser.hxx
\
src/util/NumberParser.hxx
\
src/util/StringUtil.cxx src/util/StringUtil.hxx
\
src/util/StringUtil.cxx src/util/StringUtil.hxx
\
...
...
src/util/UTF8.cxx
0 → 100644
View file @
d5cf41e0
/*
* Copyright (C) 2011-2014 Max Kellermann <max@duempel.org>
* http://www.musicpd.org
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "UTF8.hxx"
#include "CharUtil.hxx"
#include <algorithm>
/**
 * Is this a leading byte that is followed by 1 continuation byte?
 *
 * Such a byte has the bit pattern 110xxxxx.
 */
static constexpr bool
IsLeading1(unsigned char ch)
{
	/* the upper three bits must be exactly 110 */
	return (ch >> 5) == 0x06;
}
/**
 * Generate the leading byte of a sequence with 1 continuation byte
 * by setting the 110xxxxx marker bits on the given value.  The value
 * is not masked here.
 */
static constexpr unsigned char
MakeLeading1(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xc0);
}
/**
 * Is this a leading byte that is followed by 2 continuation bytes?
 *
 * Such a byte has the bit pattern 1110xxxx.
 */
static constexpr bool
IsLeading2(unsigned char ch)
{
	/* the upper four bits must be exactly 1110 */
	return (ch >> 4) == 0x0e;
}
/**
 * Generate the leading byte of a sequence with 2 continuation bytes
 * by setting the 1110xxxx marker bits on the given value.  The value
 * is not masked here.
 */
static constexpr unsigned char
MakeLeading2(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xe0);
}
/**
 * Is this a leading byte that is followed by 3 continuation bytes?
 *
 * Such a byte has the bit pattern 11110xxx.
 */
static constexpr bool
IsLeading3(unsigned char ch)
{
	/* the upper five bits must be exactly 11110 */
	return (ch >> 3) == 0x1e;
}
/**
 * Generate the leading byte of a sequence with 3 continuation bytes
 * by setting the 11110xxx marker bits on the given value.  The value
 * is not masked here.
 */
static constexpr unsigned char
MakeLeading3(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xf0);
}
/**
 * Is this a leading byte that is followed by 4 continuation bytes?
 *
 * Such a byte has the bit pattern 111110xx.
 */
static constexpr bool
IsLeading4(unsigned char ch)
{
	/* the upper six bits must be exactly 111110 */
	return (ch >> 2) == 0x3e;
}
/**
 * Generate the leading byte of a sequence with 4 continuation bytes
 * by setting the 111110xx marker bits on the given value.  The value
 * is not masked here.
 */
static constexpr unsigned char
MakeLeading4(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xf8);
}
/**
 * Is this a leading byte that is followed by 5 continuation bytes?
 *
 * Such a byte has the bit pattern 1111110x.
 */
static constexpr bool
IsLeading5(unsigned char ch)
{
	/* the upper seven bits must be exactly 1111110 */
	return (ch >> 1) == 0x7e;
}
/**
 * Generate the leading byte of a sequence with 5 continuation bytes
 * by setting the 1111110x marker bits on the given value.  The value
 * is not masked here.
 */
static constexpr unsigned char
MakeLeading5(unsigned char value)
{
	return static_cast<unsigned char>(value | 0xfc);
}
/**
 * Is this a continuation byte, i.e. a byte with the bit pattern
 * 10xxxxxx?
 */
static constexpr bool
IsContinuation(unsigned char ch)
{
	/* the upper two bits must be exactly 10 */
	return (ch >> 6) == 0x02;
}
/**
 * Generate a continuation byte from the low 6 bits of the given
 * value; all higher bits of the value are discarded.
 */
static constexpr unsigned char
MakeContinuation(unsigned char value)
{
	return static_cast<unsigned char>((value & 0x3f) | 0x80);
}
bool
ValidateUTF8
(
const
char
*
p
)
{
for
(;
*
p
!=
0
;
++
p
)
{
unsigned
char
ch
=
*
p
;
if
(
IsASCII
(
ch
))
continue
;
if
(
IsContinuation
(
ch
))
/* continuation without a prefix */
return
false
;
if
(
IsLeading1
(
ch
))
{
/* 1 continuation */
if
(
!
IsContinuation
(
*++
p
))
return
false
;
}
else
if
(
IsLeading2
(
ch
))
{
/* 2 continuations */
if
(
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
))
return
false
;
}
else
if
(
IsLeading3
(
ch
))
{
/* 3 continuations */
if
(
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
))
return
false
;
}
else
if
(
IsLeading4
(
ch
))
{
/* 4 continuations */
if
(
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
))
return
false
;
}
else
if
(
IsLeading5
(
ch
))
{
/* 5 continuations */
if
(
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
)
||
!
IsContinuation
(
*++
p
))
return
false
;
}
else
return
false
;
}
return
true
;
}
/**
 * Skip the leading run of characters accepted by IsASCII().
 *
 * @return a pointer to the first character rejected by IsASCII(),
 * or to the null terminator if there is no such character
 */
static const char *
FindNonASCIIOrZero(const char *p)
{
	for (;; ++p)
		if (*p == 0 || !IsASCII(*p))
			return p;
}
/**
 * Convert the null-terminated ISO-8859-1 string #src to UTF-8.
 *
 * If #src contains only ASCII characters, nothing is copied and #src
 * itself is returned.  Otherwise, the converted string (including
 * the null terminator) is stored in #buffer.
 *
 * @param src the null-terminated source string
 * @param buffer the destination buffer
 * @param buffer_size the size of the destination buffer in bytes
 * @return #src if no conversion was necessary, #buffer if the string
 * was converted, or nullptr if the destination buffer is too small
 * (its contents may have been partially overwritten in that case)
 */
const char *
Latin1ToUTF8(const char *gcc_restrict src,
	     char *gcc_restrict buffer, size_t buffer_size)
{
	const char *p = FindNonASCIIOrZero(src);
	if (*p == 0)
		/* everything is plain ASCII, we don't need to
		   convert anything */
		return src;

	if ((size_t)(p - src) >= buffer_size)
		/* buffer too small */
		return nullptr;

	const char *const end = buffer + buffer_size;

	/* the ASCII prefix can be copied verbatim */
	char *q = std::copy(src, p, buffer);

	/* loop invariant: q < end, i.e. there is always room for the
	   null terminator */
	while (*p != 0) {
		unsigned char ch = *p++;

		if (IsASCII(ch)) {
			*q++ = ch;

			if (q >= end)
				/* buffer too small */
				return nullptr;
		} else {
			/* two bytes plus the null terminator must
			   fit; compare the remaining space instead
			   of evaluating "q + 2", which could point
			   beyond one-past-the-end of the buffer */
			if (end - q <= 2)
				/* buffer too small */
				return nullptr;

			*q++ = MakeLeading1(ch >> 6);
			*q++ = MakeContinuation(ch);
		}
	}

	*q = 0;
	return buffer;
}
/**
 * Encode one character as a sequence of 1 to 6 bytes and store it
 * at #q.  No null terminator is appended.
 *
 * @param ch the character value; values of 0x80000000 and above
 * cannot be encoded and nothing is written for them
 * @param q the destination
 * @return a pointer to the byte after the last one written (equal
 * to #q if nothing was written)
 */
char *
UnicodeToUTF8(unsigned ch, char *q)
{
	if (gcc_likely(ch < 0x80)) {
		/* plain ASCII: a single byte */
		*q++ = (char)ch;
		return q;
	}

	/* choose the leading byte and the number of continuation
	   bytes according to the magnitude of the value */
	unsigned char lead;
	unsigned n_continuations;

	if (gcc_likely(ch < 0x800)) {
		lead = MakeLeading1(ch >> 6);
		n_continuations = 1;
	} else if (ch < 0x10000) {
		lead = MakeLeading2(ch >> 12);
		n_continuations = 2;
	} else if (ch < 0x200000) {
		lead = MakeLeading3(ch >> 18);
		n_continuations = 3;
	} else if (ch < 0x4000000) {
		lead = MakeLeading4(ch >> 24);
		n_continuations = 4;
	} else if (ch < 0x80000000) {
		lead = MakeLeading5(ch >> 30);
		n_continuations = 5;
	} else {
		// error
		return q;
	}

	*q++ = lead;

	/* emit the remaining bits in groups of 6, most significant
	   group first */
	for (unsigned shift = 6 * n_continuations; shift > 0;) {
		shift -= 6;
		*q++ = MakeContinuation(ch >> shift);
	}

	return q;
}
/**
 * Count the characters in the null-terminated string.
 *
 * This is a very naive implementation: it performs no verification
 * and simply counts all bytes which are not UTF-8 continuation
 * bytes.
 */
size_t
LengthUTF8(const char *p)
{
	size_t count = 0;

	while (*p != 0) {
		if (!IsContinuation(*p))
			++count;

		++p;
	}

	return count;
}
src/util/UTF8.hxx
0 → 100644
View file @
d5cf41e0
/*
* Copyright (C) 2011-2014 Max Kellermann <max@duempel.org>
* http://www.musicpd.org
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UTF8_HXX
#define UTF8_HXX
#include "Compiler.h"
#include <stddef.h>
/**
 * Is this a valid UTF-8 string?
 *
 * The check is purely structural: every non-ASCII leading byte must
 * be followed by the number of continuation bytes it announces (1 to
 * 5), and a continuation byte must not appear without a leading
 * byte.  The decoded values themselves are not examined.
 *
 * @param p a null-terminated string
 * @return true if the whole string passes the check
 */
gcc_pure
gcc_nonnull_all
bool
ValidateUTF8
(
const
char
*
p
);
/**
* Convert the specified string from ISO-8859-1 to UTF-8.
*
* @return the UTF-8 version of the source string; may return #src if
* there are no non-ASCII characters; returns nullptr if the destination
* buffer is too small
*/
gcc_pure
gcc_nonnull_all
const
char
*
Latin1ToUTF8
(
const
char
*
src
,
char
*
buffer
,
size_t
buffer_size
);
/**
 * Convert the specified Unicode character to UTF-8 and write it to
 * the buffer.  The buffer must have a length of at least 6!  No null
 * terminator is written.
 *
 * Values of 0x80000000 and above cannot be encoded; nothing is
 * written for them and the buffer pointer is returned unchanged.
 *
 * @param ch the character to be encoded
 * @param buffer the destination buffer
 * @return a pointer to the buffer plus the added byte(s)
 */
gcc_nonnull_all
char
*
UnicodeToUTF8
(
unsigned
ch
,
char
*
buffer
);
/**
 * Returns the number of characters in the string.  This is different
 * from strlen(), which counts the number of bytes.
 *
 * The string is not verified: every byte that is not a UTF-8
 * continuation byte is counted as one character.
 *
 * @param p a null-terminated string
 */
gcc_pure
gcc_nonnull_all
size_t
LengthUTF8
(
const
char
*
p
);
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment