dev-python/cchardet/files/cchardet-2.1.7-pytest.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

https://github.com/PyYoshi/cChardet/pull/78

From: q0w <43147888+q0w@users.noreply.github.com>
Date: Wed, 17 Nov 2021 14:50:41 +0300
Subject: [PATCH 02/13] Use pytest

--- /dev/null
+++ b/src/tests/cchardet_test.py
@@ -0,0 +1,111 @@
+import glob
+import os
+
+import cchardet
+
+# Fixtures that test_detect() does not check.
+SKIP_LIST = [
+    'src/tests/testdata/ja/utf-16le.txt',
+    'src/tests/testdata/ja/utf-16be.txt',
+    'src/tests/testdata/es/iso-8859-15.txt',
+    'src/tests/testdata/da/iso-8859-1.txt',
+    'src/tests/testdata/he/iso-8859-8.txt'
+]
+
+# Python can't decode these encodings; SKIP_LIST entries are appended as well.
+SKIP_LIST_02 = [
+    'src/tests/testdata/vi/viscii.txt',
+    'src/tests/testdata/zh/euc-tw.txt'
+] + SKIP_LIST
+
+
+def test_ascii():
+    """The lowercase Latin alphabet must be reported as the ascii encoding."""
+    assert 'ascii' == cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')['encoding'].lower()
+
+
+def test_detect():
+    """Each fixture's detected encoding must equal its file name minus '.txt'."""
+    testfiles = glob.glob('src/tests/testdata/*/*.txt')
+    assert testfiles, 'no fixtures found; run pytest from the directory holding src/'
+    for testfile in testfiles:
+        if testfile.replace("\\", "/") in SKIP_LIST:  # normalise Windows separators
+            continue
+        expected_charset = os.path.splitext(os.path.basename(testfile))[0]
+        with open(testfile, 'rb') as f:
+            detected_encoding = cchardet.detect(f.read())
+        # The fixture path is the assert message so a failure names the file.
+        assert expected_charset.lower() == detected_encoding['encoding'].lower(), testfile
+
+
+def test_detector():
+    """Feed a Shift_JIS sample line by line and check the detector's result."""
+    sample_path = "src/tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
+    detector = cchardet.UniversalDetector()
+    with open(sample_path, 'rb') as f:
+        for line in f:
+            detector.feed(line)
+            # Stop reading as soon as the detector reports it is done.
+            if detector.done:
+                break
+    detector.close()
+    assert "shift_jis" == detector.result['encoding'].lower()
+
+
+def test_github_issue_20():
+    """
+    Scenario from https://github.com/PyYoshi/cChardet/issues/20: a lone
+    0x8f byte is run through detect() and UniversalDetector; nothing is asserted.
+    """
+    lone_byte = b'\x8f'
+    cchardet.detect(lone_byte)
+
+    streaming = cchardet.UniversalDetector()
+    streaming.feed(lone_byte)
+    streaming.close()
+
+
+def test_decode():
+    """Every non-skipped fixture must decode with the encoding cchardet reports."""
+    testfiles = glob.glob('src/tests/testdata/*/*.txt')
+    assert testfiles, 'no fixtures found; run pytest from the directory holding src/'
+    for testfile in testfiles:
+        if testfile.replace("\\", "/") in SKIP_LIST_02:
+            continue
+        with open(testfile, 'rb') as f:
+            msg = f.read()
+        detected_encoding = cchardet.detect(msg)
+        try:
+            msg.decode(detected_encoding["encoding"])
+        except LookupError:
+            print("LookupError: { file=%s, encoding=%s }" % (
+                testfile, detected_encoding["encoding"]))
+            # Bare raise keeps the original exception and traceback intact.
+            raise
+
+
+def test_utf8_with_bom():
+    """A bare UTF-8 byte order mark must be reported as utf-8-sig."""
+    result = cchardet.detect(b'\xEF\xBB\xBF')
+    assert result['encoding'].lower() == "utf-8-sig"
+
+
+def test_null_bytes():
+    """Input with a NUL byte followed by 0x80 0x81 must yield no encoding."""
+    # detect() returns a dict; only its 'encoding' entry is checked here.
+    reported = cchardet.detect(b'ABC\x00\x80\x81')['encoding']
+    assert reported is None
+
+# def test_iso8859_2_csv(self):
+#     testfile = 'tests/samples/iso8859-2.csv'
+#     with open(testfile, 'rb') as f:
+#         msg = f.read()
+#         detected_encoding = cchardet.detect(msg)
+#         eq_(
+#             "iso8859-2",
+#             detected_encoding['encoding'].lower(),
+#             'Expected %s, but got %s' % (
+#                 "iso8859-2",
+#                 detected_encoding['encoding'].lower()
+#             )
+#         )