Add more encodings

BlackthornYugen · BlackthornYugen · commit 66529448231d · 2024-06-18T13:04:09.000-04:00
diff --git a/httpbin/core.py b/httpbin/core.py
@@ -51,6 +51,7 @@
     parse_multi_value_header,
     next_stale_after_value,
     digest_challenge_response,
+    normalize_charset,
 )
 from .utils import weighted_choice
 from .structures import CaseInsensitiveDict
@@ -1407,20 +1408,96 @@ def cache_control(value):
     return response
 
 
-@app.route("/encoding/utf8")
-def encoding():
-    """Returns a UTF-8 encoded body.
+@app.route("/encoding/<charset>")
+def encoding(charset):
+    """Returns the requested charset and encoding.
     ---
     tags:
       - Response formats
+    parameters:
+      - in: path
+        name: charset
+        type:
+        default: 'utf8'
+      - in: query
+        name: content-type
+        type: string
+        description: The content type of the response. If unset will use response content type ("accept" header).
+        default: ''
+    produces:
+      - text/html
+      - text/plain
+      - '*/*'
+    responses:
+      200:
+        description: Content with the requested encoding and content type.
+    """
+    return encoding_generic(charset, None)
+
+
+@app.route("/encoding/<charset>/<body>")
+def encoding_generic(charset, body):
+    """Returns the requested charset and encoding.
+    ---
+    tags:
+      - Response formats
+    parameters:
+      - in: path
+        name: charset
+        type:
+        default: 'utf8'
+      - in: query
+        name: content-type
+        type: string
+        description: The content type of the response. If unset will use response content type ("accept" header).
+        default: ''
+      - in: path
+        name: body
+        type: string
+        default: SFRUUEJJTiDjga_mnIDpq5jjgafjgZk=
     produces:
       - text/html
+      - text/plain
+      - '*/*'
     responses:
       200:
-        description: Encoded UTF-8 content.
+        description: Content with the requested encoding and content type and body.
     """
+    response = make_response()
+
+    charset = charset or request.headers.get("accept-charset", "utf-8")
+    accept_header = request.headers.get("accept")
+    if accept_header is not None:
+        accept_header = accept_header.split(";")[0].split(",")[0]
+    response.content_type = (request.args.get("content-type", accept_header) or "text/html") + "; charset=" + charset
+    normalized_charset = (normalize_charset(charset) or "utf-8").lower()
 
-    return render_template("UTF-8-demo.txt")
+    if body:
+        response.data = base64.urlsafe_b64decode(body)
+        return response
+    elif normalized_charset in ["utf-8", "utf-16", "utf-32"]:
+        template_data = {
+            "title": "Unicode Demo",
+            "citation_url": "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt",
+            "body_template": "encoding/utf-8.txt",
+            "citation_prefix": ("Taken from" if normalized_charset == "utf-8"
+                else f"Re-encoded to {normalized_charset} from the utf-8 taken from")
+        }
+    else:
+        template_data = {
+            "title": f"{normalized_charset} Demo",
+            "citation_url": "",
+            "body_template": f"encoding/{normalized_charset}.txt",
+            "citation_prefix": ""
+        }
+
+    if response.content_type.startswith("text/html"):
+        template_name = "encoding/demo.html.j2"
+    else:
+        template_name = template_data["body_template"]
+    response.data = render_template(template_name, **template_data).encode(normalized_charset)
+
+    return response
 
 
 @app.route("/bytes/<int:n>")
diff --git a/httpbin/helpers.py b/httpbin/helpers.py
@@ -483,3 +483,28 @@ def digest_challenge_response(app, qop, algorithm, stale = False):
     auth = WWWAuthenticate("digest", values=values)
     response.headers['WWW-Authenticate'] = auth.to_header()
     return response
+
+
+def normalize_charset(charset):
+    charset = charset.lower()
+    charset_aliases = {
+        "utf[-_]?8": "UTF-8",
+        "utf[-_]?16": "UTF-16",
+        "utf[-_]?32": "UTF-32",
+        "iso-ir-6|ansi_x3.4-1968|ansi_x3.4-1986|iso_646.irv:1991|ascii|iso646-us|us|csascii": "US-ASCII",
+        "iso[-_]?8859[-_]?2|iso-ir-101|csisolatin2|latin[-_]?2|l2|ibm912|cp912": "ISO-8859-2",
+        "iso[-_]?8859[-_]?3|iso-ir-109|csisolatin3|latin[-_]?3|l3|ibm913|cp913": "ISO-8859-3",
+        "iso[-_]?8859[-_]?4|iso-ir-110|csisolatin4|latin[-_]?4|l4|ibm914|cp914": "ISO-8859-4",
+        "iso[-_]?8859[-_]?1?|iso-ir-100|csisolatin1|latin[-_]?1|l1|ibm819|cp819": "ISO-8859-1",
+        "big5|csbig5|cn-big5": "Big5",
+        "gb2312|csgb2312|chinese": "GB2312",
+        "euc-jp|.*japanese": "EUC-JP",
+        "shift_jis|csshiftjis|ms_kanji|x-sjis": "Shift_JIS",
+        "windows-1252|windows1252|cp1252|ms-ee": "Windows-1252",
+    }
+
+    for pattern, normalized in charset_aliases.items():
+        if re.match(pattern, charset):
+            return normalized
+
+    return None
diff --git a/httpbin/templates/encoding/big5.txt b/httpbin/templates/encoding/big5.txt
@@ -0,0 +1,6 @@
+這是一個中文文本的範例。這段文本是以Big5編碼的。Big5是用來處理中文字符的編碼之一。以下是一段中文的文章。
+
+中文中有很多不同的字符，包括漢字、標點符號等。漢字是從古代流傳下來的字符，每個字符都有其獨特的意義。例如，“中文”這個詞是由兩個漢字組成的：“中”和“文”。
+
+今天的天氣非常好。天空湛藍，微風習習。公園裡有許多人在散步。孩子們在遊樂場玩耍，大人們坐在長椅上聊天。很多人帶著狗在公園裡散步。在自然環境中度過時光，讓人感到非常放鬆。
+
diff --git a/httpbin/templates/encoding/demo.html.j2 b/httpbin/templates/encoding/demo.html.j2
@@ -0,0 +1,8 @@
+<h1>{{ title }}</h1>
+
+{% if citation_url %}<p>{{ citation_prefix }} <a
+href="{{ citation_url }}">{{ citation_url }}</a></p>{% endif %}
+
+<pre>
+{% include body_template ignore missing %}
+</pre>
diff --git a/httpbin/templates/encoding/euc-jp.txt b/httpbin/templates/encoding/euc-jp.txt
@@ -0,0 +1,6 @@
+これは日本語のテキストの例です。このテキストは、EUC-JPエンコーディングで符号化されています。EUC-JPは、日本語をコンピュータで扱うための文字エンコーディングの一つです。以下に、日本語の文章を続けます。
+
+日本語には、漢字、ひらがな、カタカナの三種類の文字があります。漢字は中国から伝わった文字で、意味を持つ文字です。ひらがなとカタカナは、日本独自の音節文字で、発音を表します。例えば、「日本語」という単語は、漢字で「日本」と書き、ひらがなで「ご」と書きます。
+
+今日の天気は晴れです。青い空が広がっており、風も心地よいです。公園では、多くの人々が散歩を楽しんでいます。子供たちは遊具で遊び、大人たちはベンチに座って話をしています。犬を連れた人も多く見かけます。自然の中で過ごす時間は、とてもリフレッシュできます。
+
diff --git a/httpbin/templates/encoding/gb2312.txt b/httpbin/templates/encoding/gb2312.txt
@@ -0,0 +1,6 @@
+这是一个中文文本的范例。这段文本是以GB2312编码的。GB2312是用来处理中文字符的编码之一。以下是一段中文的文章。
+
+中文中有很多不同的字符，包括汉字、标点符号等。汉字是从古代流传下来的字符，每个字符都有其独特的意义。例如，“中文”这个词是由两个汉字组成的：“中”和“文”。
+
+今天的天气非常好。天空湛蓝，微风习习。公园里有许多人在散步。孩子们在游乐场玩耍，大人们坐在长椅上聊天。很多人带着狗在公园里散步。在自然环境中度过时光，让人感到非常放松。
+
diff --git a/httpbin/templates/encoding/iso-8859-1.txt b/httpbin/templates/encoding/iso-8859-1.txt
@@ -0,0 +1,15 @@
+https://en.wikipedia.org/wiki/ISO/IEC_8859-1
+
+    !   "   #   $   %   &   '   (   )   *   +   ,   -   .  /
+0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >  ?
+@   A   B   C   D   E   F   G   H   I   J   K   L   M   N  O
+P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^  _
+`   a   b   c   d   e   f   g   h   i   j   k   l   m   n  o
+p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
+NBSP¡   ¢   £   ¤   ¥   ¦   §   ¨   ©   ª   «   ¬  SHY  ®  ¯
+°   ±   ²   ³   ´   µ   ¶   ·   ¸   ¹   º   »   ¼   ½   ¾  ¿
+À   Á   Â   Ã   Ä   Å   Æ   Ç   È   É   Ê   Ë   Ì   Í   Î  Ï
+Ð   Ñ   Ò   Ó   Ô   Õ   Ö   ×   Ø   Ù   Ú   Û   Ü   Ý   Þ  ß
+à   á   â   ã   ä   å   æ   ç   è   é   ê   ë   ì   í   î  ï
+ð   ñ   ò   ó   ô   õ   ö   ÷   ø   ù   ú   û   ü   ý   þ  ÿ
+
diff --git a/httpbin/templates/encoding/iso-8859-2.txt b/httpbin/templates/encoding/iso-8859-2.txt
@@ -0,0 +1,15 @@
+https://en.wikipedia.org/wiki/ISO/IEC_8859-2
+
+    !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
+0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
+@   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
+P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
+`   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
+p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
+NBSPĄ   ˘   Ł   ¤   Ľ   Ś   §   ¨   Š   Ş   Ť   Ź  SHY  Ž   Ż
+°   ą   ˛   ł   ´   ľ   ś   ˇ   ¸   š   ş   ť   ź   ˝   ž   ż
+Ŕ   Á   Â   Ă   Ä   Ĺ   Ć   Ç   Č   É   Ę   Ë   Ě   Í   Î   Ď
+Đ   Ń   Ň   Ó   Ô   Ő   Ö   ×   Ř   Ů   Ú   Ű   Ü   Ý   Ţ   ß
+ŕ   á   â   ă   ä   ĺ   ć   ç   č   é   ę   ë   ě   í   î   ď
+đ   ń   ň   ó   ô   ő   ö   ÷   ř   ů   ú   ű   ü   ý   ţ   ˙
+
diff --git a/httpbin/templates/encoding/iso-8859-3.txt b/httpbin/templates/encoding/iso-8859-3.txt
@@ -0,0 +1,15 @@
+https://en.wikipedia.org/wiki/ISO/IEC_8859-3
+
+    !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
+0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
+@   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
+P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
+`   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
+p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
+NBSPĦ   ˘   £   ¤       Ĥ   §   ¨   İ   Ş   Ğ   Ĵ  SHY      Ż
+°   ħ   ²   ³   ´   µ   ĥ   ·   ¸   ı   ş   ğ   ĵ   ½       ż
+À   Á   Â       Ä   Ċ   Ĉ   Ç   È   É   Ê   Ë   Ì   Í   Î   Ï
+    Ñ   Ò   Ó   Ô   Ġ   Ö   ×   Ĝ   Ù   Ú   Û   Ü   Ŭ   Ŝ   ß
+à   á   â       ä   ċ   ĉ   ç   è   é   ê   ë   ì   í   î   ï
+    ñ   ò   ó   ô   ġ   ö   ÷   ĝ   ù   ú   û   ü   ŭ   ŝ   ˙
+
diff --git a/httpbin/templates/encoding/iso-8859-4.txt b/httpbin/templates/encoding/iso-8859-4.txt
@@ -0,0 +1,15 @@
+https://en.wikipedia.org/wiki/ISO/IEC_8859-4
+
+    !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
+0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
+@   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
+P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
+`   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
+p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
+NBSPĄ   ĸ   Ŗ   ¤   Ĩ   Ļ   §   ¨   Š   Ē   Ģ   Ŧ  SHY  Ž   ¯
+°   ą   ˛   ŗ   ´   ĩ   ļ   ˇ   ¸   š   ē   ģ   ŧ   Ŋ   ž   ŋ
+Ā   Á   Â   Ã   Ä   Å   Æ   Į   Č   É   Ę   Ë   Ė   Í   Î   Ī
+Đ   Ņ   Ō   Ķ   Ô   Õ   Ö   ×   Ø   Ų   Ú   Û   Ü   Ũ   Ū   ß
+ā   á   â   ã   ä   å   æ   į   č   é   ę   ë   ė   í   î   ī
+đ   ņ   ō   ķ   ô   õ   ö   ÷   ø   ų   ú   û   ü   ũ   ū   ˙
+
diff --git a/httpbin/templates/encoding/shift_jis.txt b/httpbin/templates/encoding/shift_jis.txt
@@ -0,0 +1,6 @@
+これは日本語のテキストの例です。このテキストは、Shift_JISエンコーディングで符号化されています。Shift_JISは、日本語をコンピュータで扱うための文字エンコーディングの一つです。以下に、日本語の文章を続けます。
+
+日本語には、漢字、ひらがな、カタカナの三種類の文字があります。漢字は中国から伝わった文字で、意味を持つ文字です。ひらがなとカタカナは、日本独自の音節文字で、発音を表します。例えば、「日本語」という単語は、漢字で「日本」と書き、ひらがなで「ご」と書きます。
+
+今日の天気は晴れです。青い空が広がっており、風も心地よいです。公園では、多くの人々が散歩を楽しんでいます。子供たちは遊具で遊び、大人たちはベンチに座って話をしています。犬を連れた人も多く見かけます。自然の中で過ごす時間は、とてもリフレッシュできます。
+
diff --git a/httpbin/templates/encoding/utf-8.txt b/httpbin/templates/encoding/utf-8.txt
@@ -1,9 +1,3 @@
-<h1>Unicode Demo</h1>
-
-<p>Taken from <a
-href="http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt">http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt</a></p>
-
-<pre>
 
 UTF-8 encoded sample plain-text file
 ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
@@ -217,4 +211,3 @@ Box drawing alignment tests:                                          █
   ╚══╩══╝  └──┴──┘  ╰──┴──╯  ╰──┴──╯  ┗━━┻━━┛  ▗▄▖▛▀▜   └╌╌┘ ╎ ┗╍╍┛ ┋  ▁▂▃▄▅▆▇█
                                                ▝▀▘▙▄▟
 
-</pre>
diff --git a/httpbin/templates/encoding/windows-1252.txt b/httpbin/templates/encoding/windows-1252.txt
@@ -0,0 +1,16 @@
+https://en.wikipedia.org/wiki/Windows-1252
+
+    !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
+0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
+@   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
+P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
+`   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
+p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
+€       ‚   ƒ   „   …   †   ‡   ˆ   ‰   Š   ‹   Œ           Ž
+NBSP¡   ¢   £   ¤   ¥   ¦   §   ¨   ©   ª   «   ¬  SHY  ®   ¯
+°   ±   ²   ³   ´   µ   ¶   ·   ¸   ¹   º   »   ¼   ½   ¾   ¿
+À   Á   Â   Ã   Ä   Å   Æ   Ç   È   É   Ê   Ë   Ì   Í   Î   Ï
+Ð   Ñ   Ò   Ó   Ô   Õ   Ö   ×   Ø   Ù   Ú   Û   Ü   Ý   Þ   ß
+à   á   â   ã   ä   å   æ   ç   è   é   ê   ë   ì   í   î   ï
+ð   ñ   ò   ó   ô   õ   ö   ÷   ø   ù   ú   û   ü   ý   þ   ÿ
+
diff --git a/tests/test_httpbin.py b/tests/test_httpbin.py
@@ -11,6 +11,7 @@
 
 import httpbin
 from httpbin.helpers import parse_multi_value_header
+from httpbin.helpers import normalize_charset
 
 
 @contextlib.contextmanager
@@ -790,3 +791,113 @@ def test_parse_multi_value_header(self):
         self.assertEqual(parse_multi_value_header('"xyzzy", "r2d2xxxx", "c3piozzzz"'), [ "xyzzy", "r2d2xxxx", "c3piozzzz" ])
         self.assertEqual(parse_multi_value_header('W/"xyzzy", W/"r2d2xxxx", W/"c3piozzzz"'), [ "xyzzy", "r2d2xxxx", "c3piozzzz" ])
         self.assertEqual(parse_multi_value_header('*'), [ "*" ])
+
+    def test_encoding_endpoint(self):
+        codec = 'utf-8'
+        response = self.app.get(f'/encoding/{codec}')
+
+        # Check that the request was successful.
+        self.assertEqual(response.status_code, 200)
+
+        # Check that the response headers indicate the correct content type.
+        self.assertEqual(response.headers['Content-Type'], 'text/html; charset=utf-8')
+
+        # Check that the response body is not empty.
+        self.assertTrue(len(response.text) > 0)
+
+    def test_encoding_endpoint_iso(self):
+        response = self.app.get('/encoding/ISO-8859-1?content-type=application/json')
+
+        # Check that the request was successful.
+        self.assertEqual(response.status_code, 200)
+
+        # Check that the response headers indicate the correct content type.
+        self.assertEqual("application/json; charset=ISO-8859-1", response.headers['Content-Type'])
+
+        # Check that the response body is not empty.
+        self.assertTrue(len(response.data) > 0)
+
+    def test_swagger_spec(self):
+        response = self.app.get('/spec.json')
+
+        # Check that the request was successful.
+        self.assertEqual(response.status_code, 200)
+
+        # Check that the response body is not empty.
+        self.assertTrue(len(response.text) > 0)
+
+    def test_normalize_charset(self):
+        test_cases = [
+            ("UTF8", "UTF-8"),
+            ("utf-8", "UTF-8"),
+            ("UTF16", "UTF-16"),
+            ("utf-16", "UTF-16"),
+            ("UTF32", "UTF-32"),
+            ("utf-32", "UTF-32"),
+            ("utf-64", None),
+            ("iso-ir-100", "ISO-8859-1"),
+            ("csISOLatin1", "ISO-8859-1"),
+            ("latin1", "ISO-8859-1"),
+            ("l1", "ISO-8859-1"),
+            ("IBM819", "ISO-8859-1"),
+            ("CP819", "ISO-8859-1"),
+            ("iso-ir-6", "US-ASCII"),
+            ("ANSI_X3.4-1968", "US-ASCII"),
+            ("ANSI_X3.4-1986", "US-ASCII"),
+            ("ISO_646.irv:1991", "US-ASCII"),
+            ("ASCII", "US-ASCII"),
+            ("ISO646-US", "US-ASCII"),
+            ("us", "US-ASCII"),
+            ("csASCII", "US-ASCII"),
+            ("ISO-8859-1", "ISO-8859-1"),
+            ("iso8859-1", "ISO-8859-1"),
+            ("iso88591", "ISO-8859-1"),
+            ("latin1", "ISO-8859-1"),
+            ("latin-1", "ISO-8859-1"),
+            ("ISO_8859-1:1987", "ISO-8859-1"),
+            ("ISO_8859-1", "ISO-8859-1"),
+            ("ISO-8859-2", "ISO-8859-2"),
+            ("iso-ir-101", "ISO-8859-2"),
+            ("csISOLatin2", "ISO-8859-2"),
+            ("latin2", "ISO-8859-2"),
+            ("l2", "ISO-8859-2"),
+            ("IBM912", "ISO-8859-2"),
+            ("CP912", "ISO-8859-2"),
+            ("ISO-8859-3", "ISO-8859-3"),
+            ("iso-ir-109", "ISO-8859-3"),
+            ("csISOLatin3", "ISO-8859-3"),
+            ("latin3", "ISO-8859-3"),
+            ("l3", "ISO-8859-3"),
+            ("IBM913", "ISO-8859-3"),
+            ("CP913", "ISO-8859-3"),
+            ("ISO-8859-4", "ISO-8859-4"),
+            ("iso-ir-110", "ISO-8859-4"),
+            ("csISOLatin4", "ISO-8859-4"),
+            ("latin4", "ISO-8859-4"),
+            ("l4", "ISO-8859-4"),
+            ("IBM914", "ISO-8859-4"),
+            ("CP914", "ISO-8859-4"),
+            ("big5", "Big5"),
+            ("csbig5", "Big5"),
+            ("cn-big5", "Big5"),
+            ("euc-jp", "EUC-JP"),
+            ("japanese", "EUC-JP"),
+            ("cseucpkdfmtjapanese", "EUC-JP"),
+            ("extended_unix_code_packed_format_for_japanese", "EUC-JP"),
+            ("shift_jis", "Shift_JIS"),
+            ("csshiftjis", "Shift_JIS"),
+            ("ms_kanji", "Shift_JIS"),
+            ("x-sjis", "Shift_JIS"),
+            ("gb2312", "GB2312"),
+            ("csGB2312", "GB2312"),
+            ("chinese", "GB2312"),
+            ("windows-1252", "Windows-1252"),
+            ("windows1252", "Windows-1252"),
+            ("cp1252", "Windows-1252"),
+            ("ms-ee", "Windows-1252"),
+            ("unknown-charset", None),
+        ]
+
+        for charset, expected in test_cases:
+            with self.subTest(charset=charset):
+                self.assertEqual(normalize_charset(charset), expected)