Mercurial > libjeffpc
changeset 296:1f6ae020116d
urldecode: decode url-encoded strings
Signed-off-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
author | Josef 'Jeff' Sipek <jeffpc@josefsipek.net> |
---|---|
date | Mon, 31 Jul 2017 19:29:30 +0300 |
parents | 842d49f888f3 |
children | d7967afe6c56 |
files | .hgignore CMakeLists.txt include/jeffpc/urldecode.h jeffpc.mapfile-vers test_urldecode.c urldecode.c |
diffstat | 6 files changed, 492 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/.hgignore Sat Jul 29 13:54:07 2017 +0300 +++ b/.hgignore Mon Jul 31 19:29:30 2017 +0300 @@ -30,6 +30,7 @@ test_padding test_sexpr_parser test_sexpr_eval +test_urldecode test_version hgversion.h
--- a/CMakeLists.txt Sat Jul 29 13:54:07 2017 +0300 +++ b/CMakeLists.txt Mon Jul 31 19:29:30 2017 +0300 @@ -109,6 +109,7 @@ str.c synch.c taskq.c + urldecode.c uuid.c val.c version.c @@ -162,6 +163,7 @@ include/jeffpc/thread.h include/jeffpc/time.h include/jeffpc/types.h + include/jeffpc/urldecode.h include/jeffpc/uuid.h include/jeffpc/val.h include/jeffpc/version.h @@ -212,6 +214,7 @@ build_test_bin_and_run(nvl_pack) build_test_bin_and_run(padding) build_test_bin_and_run(sexpr_eval) +build_test_bin_and_run(urldecode) build_test_bin_and_run(version) add_subdirectory(tests)
/* File: include/jeffpc/urldecode.h */
/*
 * Copyright (c) 2010-2017 Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef __JEFFPC_URLDECODE_H
#define __JEFFPC_URLDECODE_H

/*
 * The prototype below uses size_t and ssize_t; pull them in here so that
 * this header can be included first, without relying on the includer.
 */
#include <sys/types.h>

/*
 * Decode `len' bytes of url-encoded data from `in' into `out'.
 *
 * '%XX' escapes (two hex digits, either case) become the byte they encode,
 * '+' becomes a space, and every other byte is copied unchanged.  The
 * output is never longer than the input, so `out' must have room for at
 * least `len' bytes.  No nul terminator is appended.
 *
 * Returns the number of bytes stored in `out' on success, -EINVAL if `in'
 * or `out' is NULL, or -EILSEQ if the input contains a malformed or
 * truncated '%' escape.
 */
extern ssize_t urldecode(const char *in, size_t len, char *out);

#endif
--- a/jeffpc.mapfile-vers Sat Jul 29 13:54:07 2017 +0300 +++ b/jeffpc.mapfile-vers Mon Jul 31 19:29:30 2017 +0300 @@ -200,6 +200,9 @@ val_alloc_str; val_alloc_sym; + # urldecode + urldecode; + # version jeffpc_hgrev; jeffpc_hgrev_binary;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_urldecode.c Mon Jul 31 19:29:30 2017 +0300 @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2017 Josef 'Jeff' Sipek <jeffpc@josefsipek.net> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <jeffpc/types.h> +#include <jeffpc/error.h> +#include <jeffpc/urldecode.h> + +#include "test.c" + +struct test { + const char *in; + const char *out; + ssize_t inlen; + ssize_t outlen; +}; + +static const struct test input_tests[] = { + { + .in = NULL, + .inlen = 1, + .outlen = -EINVAL, + }, + { + .in = "", + .out = "", + .inlen = 0, + .outlen = 0, + }, + { + .in = "a", + .out = "a", + .inlen = 1, + .outlen = 1, + }, + { + .in = "ab", + .out = "a", + .inlen = 1, + .outlen = 1, + }, + { + .in = "+", + .out = " ", + .inlen = 1, + .outlen = 1, + }, + { + .in = "%20", + .out = " ", + .inlen = 3, + .outlen = 1, + }, + { + .in = "%", + .inlen = 1, + .outlen = -EILSEQ, + }, + { + .in = "%0", + .inlen = 1, + .outlen = -EILSEQ, + }, + { + .in = "%0", + .inlen = 2, + .outlen = -EILSEQ, + }, + { + .in = "%00", + .out = "\0", + .inlen = 3, + .outlen = 1, + }, + { + .in = "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789", + .out = "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789", + .inlen = 26 + 26 + 10, + .outlen = 26 + 26 + 10, + }, + { + .in = "abc+def", + .out = "abc def", + .inlen = 7, + .outlen = 7, + }, + { + .in = "abc=def", + .out = "abc=def", + .inlen = 7, + .outlen = 7, + }, + { + .in = "abc&def", + .out = "abc&def", + .inlen = 7, + .outlen = 7, + }, + { + .in = "abc%def", + .out = "abc\xde""f", + .inlen = 7, + .outlen = 5, + }, + { + .in = "abc%DEf", + .out = "abc\xde""f", + .inlen = 7, + .outlen = 5, + }, + { + .in = "abc%a0f", + .out = "abc\xa0""f", + .inlen = 7, + .outlen = 5, + }, + { + .in = "abc%A0f", + .out = "abc\xa0""f", + .inlen = 7, + .outlen = 5, + }, + { + .in = "abc%88f", + .out = "abc\x88""f", + .inlen = 7, + .outlen = 5, + }, +}; + +static const struct test arg_tests[] = { + { + .in = NULL, + .out = "", + .inlen = 0, + .outlen = -EINVAL, + }, + { + .in = "", + .out = NULL, + .inlen = 0, + .outlen = -EINVAL, + }, + { + .in = NULL, + .out = NULL, + .inlen = 0, + .outlen = -EINVAL, + }, 
+ { + .in = NULL, + .out = "", + .inlen = 1, + .outlen = -EINVAL, + }, + { + .in = "", + .out = NULL, + .inlen = 1, + .outlen = -EINVAL, + }, + { + .in = NULL, + .out = NULL, + .inlen = 1, + .outlen = -EINVAL, + }, + { + .in = "", + .out = "", + .inlen = 0, + .outlen = 0, + }, +}; + +static char out[1024 * 1024]; + +static void test_args(void) +{ + int i; + + for (i = 0; i < ARRAY_LEN(arg_tests); i++) { + const struct test *test = &arg_tests[i]; + ssize_t ret; + + fprintf(stderr, "%s: iter = %2d...", __func__, i); + + if (test->outlen > 0) + fail("expected outlen %zd > 0", test->outlen); + + ret = urldecode(test->in, test->inlen, (char *) test->out); + + if ((ret > 0) && !test->outlen) + fail("succeeded with %zd, should have succeeded with 0", + ret); + + if ((ret < 0) && !test->outlen) + fail("failed with '%s', should have succeded with 0", + xstrerror(ret)); + + if ((ret >= 0) && (test->outlen < 0)) + fail("succeeded with %zd, should have failed with %s", + ret, xstrerror(test->outlen)); + + if (ret != test->outlen) + fail("failed with '%s', should have failed with '%s'", + xstrerror(ret), xstrerror(test->outlen)); + + fprintf(stderr, "ok.\n"); + } +} + +static void test_inputs(void) +{ + int i; + + for (i = 0; i < ARRAY_LEN(input_tests); i++) { + const struct test *test = &input_tests[i]; + ssize_t outlen; + + fprintf(stderr, "%s: iter = %2d...", __func__, i); + + if ((test->outlen >= 0) && (sizeof(out) < test->outlen)) + fail("output buffer is too small; " + "need %zd bytes, got %zu", test->outlen, + sizeof(out)); + + outlen = urldecode(test->in, test->inlen, out); + + if ((outlen < 0) && (test->outlen >= 0)) + fail("urldecode failed with error: %s", + xstrerror(outlen)); + + if ((outlen >= 0) && (test->outlen < 0)) + fail("urldecode returned %zd, should have failed: %s", + outlen, xstrerror(test->outlen)); + + if ((outlen >= 0) && (outlen != test->outlen)) + fail("urldecode returned wrong number of bytes; " + "expected %zd, got %zd", test->outlen, outlen); + + 
if ((outlen < 0) && (outlen != test->outlen)) + fail("urldecode failed with wrong error; " + "expected '%s', got '%s'", xstrerror(test->outlen), + xstrerror(outlen)); + + if ((outlen >= 0) && memcmp(out, test->out, outlen)) + fail("output doesn't match expected string; " + "expected '%*.*s', got '%*.*s'", + outlen, outlen, test->out, + outlen, outlen, out); + + fprintf(stderr, "ok.\n"); + } +} + +void test(void) +{ + test_args(); + test_inputs(); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/urldecode.c Mon Jul 31 19:29:30 2017 +0300 @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2010-2017 Josef 'Jeff' Sipek <jeffpc@josefsipek.net> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <jeffpc/urldecode.h> +#include <jeffpc/error.h> + +/* + * CGI 1.1 (RFC 3875) defines the QUERY_STRING variable to contain an + * URL-ecoded string. The specific flavor of encoding is based on the rules + * in RFC 2396 (obsoleted by RFC 3986) and the HTML 4.01 Specification + * (which defines the application/x-www-form-urlencoded MIME type) which + * references RFC 1738 (which has been updated by 3986). + * + * The HTML 4.01 spec describes the algorithm for encoding a form: + * + * 1. control names and values are escaped + * 2. spaces characters are replaced by '+' + * 3. 
reserved characters are escaped according to RFC 1738 section 2.2 + * 4. each name is separated from its value with a '=' + * 5. each name/value pair is separated with a '&' + * + * Additionally, the SCGI RFC tells us that the HTML forms consider '+', + * '&', and '=' as reserved. + * + * Note that we're not interesting in decoding the whole form blob but only + * already separated out names and values. Therefore, we should never + * encounter '&' or '=' in the input. (In other words, steps 4 and 5 have + * already be undone.) This leaves us with percent-encoding (per RFC 1738) + * and the '+' to space translation. + */ + +struct state { + enum decode_state { + DS_COPY, + DS_ESC1, + DS_ESC2, + } state; + + const char *in; + char *out; + size_t len; + size_t inoff; + size_t outoff; +}; + +static inline void append_char(struct state *state, char c) +{ + VERIFY3S(state->state, ==, DS_COPY); + VERIFY3U(c, !=, '+'); + VERIFY3U(c, !=, '%'); + + state->out[state->outoff] = c; + state->outoff++; +} + +static inline int append_esc(struct state *state, char c) +{ + if ((c >= '0') && (c <= '9')) + c -= '0'; + else if ((c >= 'a') && (c <= 'f')) + c -= 'a' - 10; + else if ((c >= 'A') && (c <= 'F')) + c -= 'A' - 10; + else + return -EILSEQ; + + switch (state->state) { + case DS_ESC1: + state->out[state->outoff] = c << 4; + state->state = DS_ESC2; + break; + case DS_ESC2: + state->out[state->outoff] |= c; + state->outoff++; + state->state = DS_COPY; + break; + default: + panic("illegal state when appending an escape"); + } + + return 0; +} + +/* + * Since urldecoding produces output that is <= the input length, the output + * buffer is assumed to be the same size as the input. 
+ */ +ssize_t urldecode(const char *in, size_t len, char *out) +{ + struct state state; + int ret; + + if (!in || !out) + return -EINVAL; + + if (!len) + return 0; + + state.state = DS_COPY; + state.in = in; + state.out = out; + state.len = len; + state.inoff = 0; + state.outoff = 0; + + while (state.inoff < state.len) { + char c = state.in[state.inoff]; + + switch (state.state) { + case DS_COPY: + /* copy the char unless it is special */ + switch (c) { + case '%': + state.state = DS_ESC1; + break; + case '+': + append_char(&state, ' '); + break; + case '=': + case '&': + /* + * Even though we should + * never see these + * characters (as we're + * dealing with individual + * names/values that have + * been split up), we accept + * them as-is. + */ + default: + append_char(&state, c); + break; + } + + ret = 0; + break; + case DS_ESC1: + case DS_ESC2: + /* first/second char of an escape sequence */ + ret = append_esc(&state, c); + break; + } + + if (ret) + return ret; + + state.inoff++; + } + + return (state.state == DS_COPY) ? state.outoff : -EILSEQ; +}