changeset 296:1f6ae020116d

urldecode: decode url-encoded strings Signed-off-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
author Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
date Mon, 31 Jul 2017 19:29:30 +0300
parents 842d49f888f3
children d7967afe6c56
files .hgignore CMakeLists.txt include/jeffpc/urldecode.h jeffpc.mapfile-vers test_urldecode.c urldecode.c
diffstat 6 files changed, 492 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/.hgignore	Sat Jul 29 13:54:07 2017 +0300
+++ b/.hgignore	Mon Jul 31 19:29:30 2017 +0300
@@ -30,6 +30,7 @@
 test_padding
 test_sexpr_parser
 test_sexpr_eval
+test_urldecode
 test_version
 
 hgversion.h
--- a/CMakeLists.txt	Sat Jul 29 13:54:07 2017 +0300
+++ b/CMakeLists.txt	Mon Jul 31 19:29:30 2017 +0300
@@ -109,6 +109,7 @@
 	str.c
 	synch.c
 	taskq.c
+	urldecode.c
 	uuid.c
 	val.c
 	version.c
@@ -162,6 +163,7 @@
 		include/jeffpc/thread.h
 		include/jeffpc/time.h
 		include/jeffpc/types.h
+		include/jeffpc/urldecode.h
 		include/jeffpc/uuid.h
 		include/jeffpc/val.h
 		include/jeffpc/version.h
@@ -212,6 +214,7 @@
 build_test_bin_and_run(nvl_pack)
 build_test_bin_and_run(padding)
 build_test_bin_and_run(sexpr_eval)
+build_test_bin_and_run(urldecode)
 build_test_bin_and_run(version)
 
 add_subdirectory(tests)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/jeffpc/urldecode.h	Mon Jul 31 19:29:30 2017 +0300
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010-2017 Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __JEFFPC_URLDECODE_H
+#define __JEFFPC_URLDECODE_H
+#include <sys/types.h>	/* ssize_t and size_t, so this header stands alone */
+/* Decode len bytes of in into out; returns bytes written or -errno. */
+extern ssize_t urldecode(const char *in, size_t len, char *out);
+#endif
--- a/jeffpc.mapfile-vers	Sat Jul 29 13:54:07 2017 +0300
+++ b/jeffpc.mapfile-vers	Mon Jul 31 19:29:30 2017 +0300
@@ -200,6 +200,9 @@
 		val_alloc_str;
 		val_alloc_sym;
 
+		# urldecode
+		urldecode;
+
 		# version
 		jeffpc_hgrev;
 		jeffpc_hgrev_binary;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_urldecode.c	Mon Jul 31 19:29:30 2017 +0300
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2017 Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <jeffpc/types.h>
+#include <jeffpc/error.h>
+#include <jeffpc/urldecode.h>
+
+#include "test.c"
+
+struct test {
+	const char *in;		/* input pointer handed to urldecode() */
+	const char *out;	/* expected bytes; test_args passes it as out */
+	ssize_t inlen;		/* length handed to urldecode() */
+	ssize_t outlen;		/* expected return: byte count or -errno */
+};
+
+static const struct test input_tests[] = {
+	{ /* NULL input pointer is rejected */
+		.in = NULL,
+		.inlen = 1,
+		.outlen = -EINVAL,
+	},
+	{ /* empty input yields empty output */
+		.in = "",
+		.out = "",
+		.inlen = 0,
+		.outlen = 0,
+	},
+	{ /* single ordinary character */
+		.in = "a",
+		.out = "a",
+		.inlen = 1,
+		.outlen = 1,
+	},
+	{ /* only inlen bytes of the input are consumed */
+		.in = "ab",
+		.out = "a",
+		.inlen = 1,
+		.outlen = 1,
+	},
+	{ /* '+' decodes to a space */
+		.in = "+",
+		.out = " ",
+		.inlen = 1,
+		.outlen = 1,
+	},
+	{ /* percent-escaped space */
+		.in = "%20",
+		.out = " ",
+		.inlen = 3,
+		.outlen = 1,
+	},
+	{ /* escape cut off right after the '%' */
+		.in = "%",
+		.inlen = 1,
+		.outlen = -EILSEQ,
+	},
+	{ /* inlen of 1 covers just the '%', so the escape is cut off */
+		.in = "%0",
+		.inlen = 1,
+		.outlen = -EILSEQ,
+	},
+	{ /* escape cut off after one hex digit */
+		.in = "%0",
+		.inlen = 2,
+		.outlen = -EILSEQ,
+	},
+	{ /* escape that decodes to a NUL byte */
+		.in = "%00",
+		.out = "\0",
+		.inlen = 3,
+		.outlen = 1,
+	},
+	{ /* letters and digits are copied unchanged */
+		.in = "abcdefghijklmnopqrstuvwxyz"
+		      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+		      "0123456789",
+		.out = "abcdefghijklmnopqrstuvwxyz"
+		       "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+		       "0123456789",
+		.inlen = 26 + 26 + 10,
+		.outlen = 26 + 26 + 10,
+	},
+	{ /* '+' in the middle of a string */
+		.in = "abc+def",
+		.out = "abc def",
+		.inlen = 7,
+		.outlen = 7,
+	},
+	{ /* '=' is copied unchanged */
+		.in = "abc=def",
+		.out = "abc=def",
+		.inlen = 7,
+		.outlen = 7,
+	},
+	{ /* '&' is copied unchanged */
+		.in = "abc&def",
+		.out = "abc&def",
+		.inlen = 7,
+		.outlen = 7,
+	},
+	{ /* escape with lower-case hex letters */
+		.in = "abc%def",
+		.out = "abc\xde""f",
+		.inlen = 7,
+		.outlen = 5,
+	},
+	{ /* escape with upper-case hex letters */
+		.in = "abc%DEf",
+		.out = "abc\xde""f",
+		.inlen = 7,
+		.outlen = 5,
+	},
+	{ /* escape mixing a lower-case hex letter and a digit */
+		.in = "abc%a0f",
+		.out = "abc\xa0""f",
+		.inlen = 7,
+		.outlen = 5,
+	},
+	{ /* escape mixing an upper-case hex letter and a digit */
+		.in = "abc%A0f",
+		.out = "abc\xa0""f",
+		.inlen = 7,
+		.outlen = 5,
+	},
+	{ /* escape made of decimal digits only */
+		.in = "abc%88f",
+		.out = "abc\x88""f",
+		.inlen = 7,
+		.outlen = 5,
+	},
+};
+
+static const struct test arg_tests[] = {
+	{ /* NULL in, valid out, zero length */
+		.in = NULL,
+		.out = "",
+		.inlen = 0,
+		.outlen = -EINVAL,
+	},
+	{ /* valid in, NULL out, zero length */
+		.in = "",
+		.out = NULL,
+		.inlen = 0,
+		.outlen = -EINVAL,
+	},
+	{ /* both pointers NULL, zero length */
+		.in = NULL,
+		.out = NULL,
+		.inlen = 0,
+		.outlen = -EINVAL,
+	},
+	{ /* NULL in, valid out, non-zero length */
+		.in = NULL,
+		.out = "",
+		.inlen = 1,
+		.outlen = -EINVAL,
+	},
+	{ /* valid in, NULL out, non-zero length */
+		.in = "",
+		.out = NULL,
+		.inlen = 1,
+		.outlen = -EINVAL,
+	},
+	{ /* both pointers NULL, non-zero length */
+		.in = NULL,
+		.out = NULL,
+		.inlen = 1,
+		.outlen = -EINVAL,
+	},
+	{ /* both pointers valid, zero length: succeeds with 0 */
+		.in = "",
+		.out = "",
+		.inlen = 0,
+		.outlen = 0,
+	},
+};
+
+static char out[1024 * 1024];	/* output buffer for test_inputs() */
+
+static void test_args(void)
+{
+	int i;
+	/* check urldecode()'s return value for every arg_tests entry */
+	for (i = 0; i < ARRAY_LEN(arg_tests); i++) {
+		const struct test *test = &arg_tests[i];
+		ssize_t ret;
+
+		fprintf(stderr, "%s: iter = %2d...", __func__, i);
+
+		if (test->outlen > 0)	/* table sanity: no output expected */
+			fail("expected outlen %zd > 0", test->outlen);
+		/* test->out (possibly NULL) is the out argument; const is cast away */
+		ret = urldecode(test->in, test->inlen, (char *) test->out);
+
+		if ((ret > 0) && !test->outlen)
+			fail("succeeded with %zd, should have succeeded with 0",
+			     ret);
+
+		if ((ret < 0) && !test->outlen)
+			fail("failed with '%s', should have succeded with 0",
+			     xstrerror(ret));
+
+		if ((ret >= 0) && (test->outlen < 0))
+			fail("succeeded with %zd, should have failed with %s",
+			     ret, xstrerror(test->outlen));
+
+		if (ret != test->outlen)	/* both negative, but different */
+			fail("failed with '%s', should have failed with '%s'",
+			     xstrerror(ret), xstrerror(test->outlen));
+
+		fprintf(stderr, "ok.\n");
+	}
+}
+
+static void test_inputs(void)
+{
+	int i;
+	/* decode every input_tests entry; check the return value and the bytes */
+	for (i = 0; i < ARRAY_LEN(input_tests); i++) {
+		const struct test *test = &input_tests[i];
+		ssize_t outlen;
+
+		fprintf(stderr, "%s: iter = %2d...", __func__, i);
+
+		if ((test->outlen >= 0) && (sizeof(out) < test->outlen))
+			fail("output buffer is too small; "
+			     "need %zd bytes, got %zu", test->outlen,
+			     sizeof(out));
+
+		outlen = urldecode(test->in, test->inlen, out);
+
+		if ((outlen < 0) && (test->outlen >= 0))
+			fail("urldecode failed with error: %s",
+			     xstrerror(outlen));
+
+		if ((outlen >= 0) && (test->outlen < 0))
+			fail("urldecode returned %zd, should have failed: %s",
+			     outlen, xstrerror(test->outlen));
+
+		if ((outlen >= 0) && (outlen != test->outlen))
+			fail("urldecode returned wrong number of bytes; "
+			     "expected %zd, got %zd", test->outlen, outlen);
+
+		if ((outlen < 0) && (outlen != test->outlen))
+			fail("urldecode failed with wrong error; "
+			     "expected '%s', got '%s'", xstrerror(test->outlen),
+			     xstrerror(outlen));
+		/* '*' width/precision arguments must be int, hence the casts */
+		if ((outlen >= 0) && memcmp(out, test->out, outlen))
+			fail("output doesn't match expected string; "
+			     "expected '%*.*s', got '%*.*s'",
+			     (int) outlen, (int) outlen, test->out,
+			     (int) outlen, (int) outlen, out);
+
+		fprintf(stderr, "ok.\n");
+	}
+}
+
+void test(void)
+{
+	test_args();	/* argument validation cases (arg_tests) */
+	test_inputs();	/* decoding cases (input_tests) */
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urldecode.c	Mon Jul 31 19:29:30 2017 +0300
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2010-2017 Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <jeffpc/urldecode.h>
+#include <jeffpc/error.h>
+
+/*
+ * CGI 1.1 (RFC 3875) defines the QUERY_STRING variable to contain a
+ * URL-encoded string.  The specific flavor of encoding is based on the rules
+ * in RFC 2396 (obsoleted by RFC 3986) and the HTML 4.01 Specification
+ * (which defines the application/x-www-form-urlencoded MIME type) which
+ * references RFC 1738 (which has been updated by RFC 3986).
+ *
+ * The HTML 4.01 spec describes the algorithm for encoding a form:
+ *
+ *  1. control names and values are escaped
+ *  2. space characters are replaced by '+'
+ *  3. reserved characters are escaped according to RFC 1738 section 2.2
+ *  4. each name is separated from its value with a '='
+ *  5. each name/value pair is separated with a '&'
+ *
+ * Additionally, the SCGI RFC tells us that the HTML forms consider '+',
+ * '&', and '=' as reserved.
+ *
+ * Note that we're not interested in decoding the whole form blob but only
+ * already separated out names and values.  Therefore, we should never
+ * encounter '&' or '=' in the input.  (In other words, steps 4 and 5 have
+ * already been undone.)  This leaves us with percent-encoding (per RFC 1738)
+ * and the '+' to space translation.
+ */
+
+struct state {	/* decoder state for one urldecode() call */
+	enum decode_state {
+		DS_COPY,	/* not inside an escape sequence */
+		DS_ESC1,	/* saw '%', expecting the first hex digit */
+		DS_ESC2,	/* expecting the second hex digit */
+	} state;
+
+	const char *in;		/* input buffer */
+	char *out;		/* output buffer */
+	size_t len;		/* number of input bytes to process */
+	size_t inoff;		/* index of the next input byte to read */
+	size_t outoff;		/* index of the next output byte to write */
+};
+
+static inline void append_char(struct state *state, char c)
+{
+	VERIFY3S(state->state, ==, DS_COPY);	/* must not be mid-escape */
+	VERIFY3U(c, !=, '+');	/* '+' must already be translated */
+	VERIFY3U(c, !=, '%');	/* '%' starts an escape, never copied */
+	/* store c at the current output offset and advance the offset */
+	state->out[state->outoff] = c;
+	state->outoff++;
+}
+
+static inline int append_esc(struct state *state, char c)
+{
+	if ((c >= '0') && (c <= '9'))	/* turn the hex digit into 0..15 */
+		c -= '0';
+	else if ((c >= 'a') && (c <= 'f'))
+		c -= 'a' - 10;
+	else if ((c >= 'A') && (c <= 'F'))
+		c -= 'A' - 10;
+	else
+		return -EILSEQ;	/* not a hex digit */
+	/* first digit sets the high nibble, the second ORs in the low one */
+	switch (state->state) {
+		case DS_ESC1:
+			state->out[state->outoff] = c << 4;
+			state->state = DS_ESC2;
+			break;
+		case DS_ESC2:
+			state->out[state->outoff] |= c;
+			state->outoff++;	/* decoded byte is complete */
+			state->state = DS_COPY;
+			break;
+		default:
+			panic("illegal state when appending an escape");
+	}
+
+	return 0;
+}
+
+/*
+ * Decode len bytes of in into out, which is assumed to have room for len
+ * bytes since decoding never grows the data.  Returns the length or -errno.
+ */
+ssize_t urldecode(const char *in, size_t len, char *out)
+{
+	struct state state;
+	int ret;
+
+	if (!in || !out)
+		return -EINVAL;
+	/* pointers are checked first, so NULL fails even when len is 0 */
+	if (!len)
+		return 0;
+
+	state.state = DS_COPY;
+	state.in = in;
+	state.out = out;
+	state.len = len;
+	state.inoff = 0;
+	state.outoff = 0;
+
+	while (state.inoff < state.len) {
+		char c = state.in[state.inoff];
+
+		switch (state.state) {
+			case DS_COPY:
+				/* copy the char unless it is special */
+				switch (c) {
+					case '%':
+						state.state = DS_ESC1;
+						break;
+					case '+':
+						append_char(&state, ' ');
+						break;
+					case '=':
+					case '&':
+						/*
+						 * Even though we should
+						 * never see these
+						 * characters (as we're
+						 * dealing with individual
+						 * names/values that have
+						 * been split up), we accept
+						 * them as-is.
+						 */
+					default:
+						append_char(&state, c);
+						break;
+				}
+
+				ret = 0;
+				break;
+			case DS_ESC1:
+			case DS_ESC2:
+				/* first/second char of an escape sequence */
+				ret = append_esc(&state, c);
+				break;
+		}
+
+		if (ret)
+			return ret;	/* -EILSEQ: bad hex digit in an escape */
+
+		state.inoff++;
+	}
+	/* input that ends in the middle of an escape sequence is an error */
+	return (state.state == DS_COPY) ? state.outoff : -EILSEQ;
+}