Mercurial > libjeffpc
view unicode.c @ 812:59a473863eaa
cmake: add a way to force 64-bit build
This is useful on OSes that support 64-bit binaries but default to 32-bit.
To force 64-bits, pass FORCE_64_BIT_BUILD to cmake. E.g.,
$ cmake . -DFORCE_64_BIT_BUILD=1
Signed-off-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
author | Josef 'Jeff' Sipek <jeffpc@josefsipek.net> |
---|---|
date | Sat, 18 Jul 2020 10:16:00 -0400 |
parents | 49bfab50b58a |
children | d563f6d72c1e |
line wrap: on
line source
/*
 * Copyright (c) 2018 Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <jeffpc/unicode.h>
#include <jeffpc/error.h>

/*
 * RFC 3629: UTF-8, a transformation format of ISO 10646
 */

/*
 * Decode the single UTF-8 encoded codepoint at the start of @in (at most
 * @inlen bytes are examined) into a UCS4 value stored in @out.
 *
 * Returns the number of input bytes consumed (1-4).  Returns 0, leaving
 * @out untouched, when the input is empty, truncated, malformed, overlong,
 * or decodes to a value rejected by utf32_is_valid().
 */
size_t utf8_to_utf32(const char *in, size_t inlen, uint32_t *out)
{
	uint32_t lowest, highest;
	unsigned char byte;
	uint32_t cp;
	size_t seqlen;
	size_t idx;

	if (inlen == 0)
		return 0; /* nothing to decode */

	byte = in[0];

	/*
	 * The lead byte determines the sequence length, contributes the
	 * most significant payload bits, and fixes the range of codepoints
	 * that may legitimately be encoded with that length.
	 */
	if ((byte & 0x80) == 0x00) {
		seqlen = 1;
		cp = byte & 0x7f;
		lowest = 0x00;
		highest = 0x7f;
	} else if ((byte & 0xe0) == 0xc0) {
		seqlen = 2;
		cp = byte & 0x1f;
		lowest = 0x080;
		highest = 0x7ff;
	} else if ((byte & 0xf0) == 0xe0) {
		seqlen = 3;
		cp = byte & 0x0f;
		lowest = 0x0800;
		highest = 0xffff;
	} else if ((byte & 0xf8) == 0xf0) {
		seqlen = 4;
		cp = byte & 0x07;
		lowest = 0x010000;
		highest = 0x10ffff;
	} else {
		return 0; /* not a legal lead byte */
	}

	if (inlen < seqlen)
		return 0; /* sequence is cut short by the end of input */

	/* every continuation byte supplies the next six payload bits */
	for (idx = 1; idx < seqlen; idx++) {
		byte = in[idx];

		if ((byte & 0xc0) != 0x80)
			return 0; /* not a continuation byte */

		cp = (cp << 6) | (byte & 0x3f);
	}

	/*
	 * Reject overlong encodings (value below the minimum for this
	 * length), values above the maximum for this length (which covers
	 * codepoints > U+10FFFF), and anything utf32_is_valid() refuses.
	 */
	if (cp < lowest || cp > highest || !utf32_is_valid(cp))
		return 0;

	*out = cp;

	return seqlen;
}

/*
 * Encode the codepoint @cp as UTF-8 into @buf, which has room for @buflen
 * bytes.  No terminator is written.
 *
 * Returns the number of bytes stored (1-4), -EINVAL if utf32_is_valid()
 * rejects @cp, or -ENOMEM if the encoding does not fit in @buflen bytes.
 */
ssize_t utf32_to_utf8(uint32_t cp, char *buf, size_t buflen)
{
	ssize_t nbytes;
	ssize_t pos;

	if (!utf32_is_valid(cp))
		return -EINVAL; /* not an encodable codepoint */

	if (cp > 0xffff)
		nbytes = 4;
	else if (cp > 0x7ff)
		nbytes = 3;
	else if (cp > 0x7f)
		nbytes = 2;
	else
		nbytes = 1;

	ASSERT3U(cp, <=, 0x10ffff);

	if (buflen < (size_t) nbytes)
		return -ENOMEM; /* output buffer is too small */

	/* ASCII is stored as-is */
	if (nbytes == 1) {
		buf[0] = cp;
		return 1;
	}

	/*
	 * Fill in the continuation bytes from last to first, peeling six
	 * bits off the bottom of the codepoint each time.
	 */
	for (pos = nbytes - 1; pos > 0; pos--) {
		buf[pos] = 0x80 | (cp & 0x3f);
		cp >>= 6;
	}

	/*
	 * Whatever is left belongs in the lead byte, below a run of
	 * @nbytes one bits.  The bits of the shifted 0xff that land above
	 * bit 7 are dropped when the value is stored into the char.
	 */
	buf[0] = cp | (0xff << (8 - nbytes));

	return nbytes;
}

/*
 * Check that the @slen bytes at @src form a sequence of codepoints that
 * utf8_to_utf32() can decode back to back.
 *
 * Returns 0 if the whole buffer decodes (an empty buffer qualifies), or
 * -EILSEQ as soon as a codepoint fails to decode.
 */
int utf8_is_valid_str(const char *src, size_t slen)
{
	uint32_t ignored;
	size_t used;
	size_t off;

	for (off = 0; off < slen; off += used) {
		used = utf8_to_utf32(src + off, slen - off, &ignored);
		if (used == 0)
			return -EILSEQ;
	}

	return 0;
}