tildefriends/deps/libuv/src/idna.c
2023-11-07 17:30:39 +00:00

556 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Copyright libuv contributors. All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/* Derived from https://github.com/bnoordhuis/punycode
* but updated to support IDNA 2008.
*/
#include "uv.h"
#include "uv-common.h"
#include "idna.h"
#include <assert.h>
#include <string.h>
#include <limits.h> /* UINT_MAX */
static int32_t uv__wtf8_decode1(const char** input) {
uint32_t code_point;
uint8_t b1;
uint8_t b2;
uint8_t b3;
uint8_t b4;
b1 = **input;
if (b1 <= 0x7F)
return b1; /* ASCII code point */
if (b1 < 0xC2)
return -1; /* invalid: continuation byte */
code_point = b1;
b2 = *++*input;
if ((b2 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b2 & 0x3F);
if (b1 <= 0xDF)
return 0x7FF & code_point; /* two-byte character */
b3 = *++*input;
if ((b3 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b3 & 0x3F);
if (b1 <= 0xEF)
return 0xFFFF & code_point; /* three-byte character */
b4 = *++*input;
if ((b4 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b4 & 0x3F);
if (b1 <= 0xF4) {
code_point &= 0x1FFFFF;
if (code_point <= 0x10FFFF)
return code_point; /* four-byte character */
}
/* code point too large */
return -1;
}
static unsigned uv__utf8_decode1_slow(const char** p,
const char* pe,
unsigned a) {
unsigned b;
unsigned c;
unsigned d;
unsigned min;
if (a > 0xF7)
return -1;
switch (pe - *p) {
default:
if (a > 0xEF) {
min = 0x10000;
a = a & 7;
b = (unsigned char) *(*p)++;
c = (unsigned char) *(*p)++;
d = (unsigned char) *(*p)++;
break;
}
/* Fall through. */
case 2:
if (a > 0xDF) {
min = 0x800;
b = 0x80 | (a & 15);
c = (unsigned char) *(*p)++;
d = (unsigned char) *(*p)++;
a = 0;
break;
}
/* Fall through. */
case 1:
if (a > 0xBF) {
min = 0x80;
b = 0x80;
c = 0x80 | (a & 31);
d = (unsigned char) *(*p)++;
a = 0;
break;
}
/* Fall through. */
case 0:
return -1; /* Invalid continuation byte. */
}
if (0x80 != (0xC0 & (b ^ c ^ d)))
return -1; /* Invalid sequence. */
b &= 63;
c &= 63;
d &= 63;
a = (a << 18) | (b << 12) | (c << 6) | d;
if (a < min)
return -1; /* Overlong sequence. */
if (a > 0x10FFFF)
return -1; /* Four-byte sequence > U+10FFFF. */
if (a >= 0xD800 && a <= 0xDFFF)
return -1; /* Surrogate pair. */
return a;
}
unsigned uv__utf8_decode1(const char** p, const char* pe) {
unsigned a;
assert(*p < pe);
a = (unsigned char) *(*p)++;
if (a < 128)
return a; /* ASCII, common case. */
return uv__utf8_decode1_slow(p, pe, a);
}
static int uv__idna_toascii_label(const char* s, const char* se,
char** d, char* de) {
static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
const char* ss;
unsigned c;
unsigned h;
unsigned k;
unsigned n;
unsigned m;
unsigned q;
unsigned t;
unsigned x;
unsigned y;
unsigned bias;
unsigned delta;
unsigned todo;
int first;
h = 0;
ss = s;
todo = 0;
/* Note: after this loop we've visited all UTF-8 characters and know
* they're legal so we no longer need to check for decode errors.
*/
while (s < se) {
c = uv__utf8_decode1(&s, se);
if (c == UINT_MAX)
return UV_EINVAL;
if (c < 128)
h++;
else
todo++;
}
/* Only write "xn--" when there are non-ASCII characters. */
if (todo > 0) {
if (*d < de) *(*d)++ = 'x';
if (*d < de) *(*d)++ = 'n';
if (*d < de) *(*d)++ = '-';
if (*d < de) *(*d)++ = '-';
}
/* Write ASCII characters. */
x = 0;
s = ss;
while (s < se) {
c = uv__utf8_decode1(&s, se);
assert(c != UINT_MAX);
if (c > 127)
continue;
if (*d < de)
*(*d)++ = c;
if (++x == h)
break; /* Visited all ASCII characters. */
}
if (todo == 0)
return h;
/* Only write separator when we've written ASCII characters first. */
if (h > 0)
if (*d < de)
*(*d)++ = '-';
n = 128;
bias = 72;
delta = 0;
first = 1;
while (todo > 0) {
m = -1;
s = ss;
while (s < se) {
c = uv__utf8_decode1(&s, se);
assert(c != UINT_MAX);
if (c >= n)
if (c < m)
m = c;
}
x = m - n;
y = h + 1;
if (x > ~delta / y)
return UV_E2BIG; /* Overflow. */
delta += x * y;
n = m;
s = ss;
while (s < se) {
c = uv__utf8_decode1(&s, se);
assert(c != UINT_MAX);
if (c < n)
if (++delta == 0)
return UV_E2BIG; /* Overflow. */
if (c != n)
continue;
for (k = 36, q = delta; /* empty */; k += 36) {
t = 1;
if (k > bias)
t = k - bias;
if (t > 26)
t = 26;
if (q < t)
break;
/* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
* 10 <= y <= 35, we can optimize the long division
* into a table-based reciprocal multiplication.
*/
x = q - t;
y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */
q = x / y;
t = t + x % y; /* 1 <= t <= 35 because of y. */
if (*d < de)
*(*d)++ = alphabet[t];
}
if (*d < de)
*(*d)++ = alphabet[q];
delta /= 2;
if (first) {
delta /= 350;
first = 0;
}
/* No overflow check is needed because |delta| was just
* divided by 2 and |delta+delta >= delta + delta/h|.
*/
h++;
delta += delta / h;
for (bias = 0; delta > 35 * 26 / 2; bias += 36)
delta /= 35;
bias += 36 * delta / (delta + 38);
delta = 0;
todo--;
}
delta++;
n++;
}
return 0;
}
ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
const char* si;
const char* st;
unsigned c;
char* ds;
int rc;
ds = d;
si = s;
while (si < se) {
st = si;
c = uv__utf8_decode1(&si, se);
if (c == UINT_MAX)
return UV_EINVAL;
if (c != '.')
if (c != 0x3002) /* 。 */
if (c != 0xFF0E) /* */
if (c != 0xFF61) /* 。 */
continue;
rc = uv__idna_toascii_label(s, st, &d, de);
if (rc < 0)
return rc;
if (d < de)
*d++ = '.';
s = si;
}
if (s < se) {
rc = uv__idna_toascii_label(s, se, &d, de);
if (rc < 0)
return rc;
}
if (d < de)
*d++ = '\0';
return d - ds; /* Number of bytes written. */
}
ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
size_t w_target_len = 0;
int32_t code_point;
do {
code_point = uv__wtf8_decode1(&source_ptr);
if (code_point < 0)
return -1;
if (code_point > 0xFFFF)
w_target_len++;
w_target_len++;
} while (*source_ptr++);
return w_target_len;
}
void uv_wtf8_to_utf16(const char* source_ptr,
uint16_t* w_target,
size_t w_target_len) {
int32_t code_point;
do {
code_point = uv__wtf8_decode1(&source_ptr);
/* uv_wtf8_length_as_utf16 should have been called and checked first. */
assert(code_point >= 0);
if (code_point > 0x10000) {
assert(code_point < 0x10FFFF);
*w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
*w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
w_target_len -= 2;
} else {
*w_target++ = code_point;
w_target_len -= 1;
}
} while (*source_ptr++);
assert(w_target_len == 0);
}
static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
ssize_t w_source_len) {
uint16_t u;
uint16_t next;
u = w_source_ptr[0];
if (u >= 0xD800 && u <= 0xDBFF && w_source_len != 1) {
next = w_source_ptr[1];
if (next >= 0xDC00 && next <= 0xDFFF)
return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00);
}
return u;
}
size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
ssize_t w_source_len) {
size_t target_len;
int32_t code_point;
target_len = 0;
while (w_source_len) {
code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
/* Can be invalid UTF-8 but must be valid WTF-8. */
assert(code_point >= 0);
if (w_source_len < 0 && code_point == 0)
break;
if (code_point < 0x80)
target_len += 1;
else if (code_point < 0x800)
target_len += 2;
else if (code_point < 0x10000)
target_len += 3;
else {
target_len += 4;
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}
return target_len;
}
int uv_utf16_to_wtf8(const uint16_t* w_source_ptr,
ssize_t w_source_len,
char** target_ptr,
size_t* target_len_ptr) {
size_t target_len;
char* target;
char* target_end;
int32_t code_point;
/* If *target_ptr is provided, then *target_len_ptr must be its length
* (excluding space for NUL), otherwise we will compute the target_len_ptr
* length and may return a new allocation in *target_ptr if target_ptr is
* provided. */
if (target_ptr == NULL || *target_ptr == NULL) {
target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
if (target_len_ptr != NULL)
*target_len_ptr = target_len;
} else {
target_len = *target_len_ptr;
}
if (target_ptr == NULL)
return 0;
if (*target_ptr == NULL) {
target = uv__malloc(target_len + 1);
if (target == NULL) {
return UV_ENOMEM;
}
*target_ptr = target;
} else {
target = *target_ptr;
}
target_end = target + target_len;
while (target != target_end && w_source_len) {
code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
/* Can be invalid UTF-8 but must be valid WTF-8. */
assert(code_point >= 0);
if (w_source_len < 0 && code_point == 0) {
w_source_len = 0;
break;
}
if (code_point < 0x80) {
*target++ = code_point;
} else if (code_point < 0x800) {
*target++ = 0xC0 | (code_point >> 6);
if (target == target_end)
break;
*target++ = 0x80 | (code_point & 0x3F);
} else if (code_point < 0x10000) {
*target++ = 0xE0 | (code_point >> 12);
if (target == target_end)
break;
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
if (target == target_end)
break;
*target++ = 0x80 | (code_point & 0x3F);
} else {
*target++ = 0xF0 | (code_point >> 18);
if (target == target_end)
break;
*target++ = 0x80 | ((code_point >> 12) & 0x3F);
if (target == target_end)
break;
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
if (target == target_end)
break;
*target++ = 0x80 | (code_point & 0x3F);
/* uv__get_surrogate_value consumed 2 input characters */
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}
target_len = target - *target_ptr;
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}
if (target != target_end && target_len_ptr != NULL)
/* Did not fill all of the provided buffer, so update the target_len_ptr
* output with the space used. */
*target_len_ptr = target - *target_ptr;
/* Check if input fit into target exactly. */
if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0)
w_source_len = 0;
*target++ = '\0';
/* Characters remained after filling the buffer, compute the remaining length now. */
if (w_source_len) {
if (target_len_ptr != NULL)
*target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
return UV_ENOBUFS;
}
return 0;
}