1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
https://github.com/PyYoshi/cChardet/pull/78
From: q0w <43147888+q0w@users.noreply.github.com>
Date: Wed, 17 Nov 2021 14:50:41 +0300
Subject: [PATCH 02/13] Use pytest
--- /dev/null
+++ b/src/tests/cchardet_test.py
@@ -0,0 +1,111 @@
+import glob
+import os
+
+import cchardet
+
+SKIP_LIST = [  # fixtures that test_detect does not check
+    'src/tests/testdata/ja/utf-16le.txt',
+    'src/tests/testdata/ja/utf-16be.txt',
+    'src/tests/testdata/es/iso-8859-15.txt',
+    'src/tests/testdata/da/iso-8859-1.txt',
+    'src/tests/testdata/he/iso-8859-8.txt',
+]
+
+# Skipped by test_decode: Python cannot decode these two encodings,
+SKIP_LIST_02 = [
+    'src/tests/testdata/vi/viscii.txt',
+    'src/tests/testdata/zh/euc-tw.txt',
+    *SKIP_LIST,  # plus every fixture test_detect already skips
+]
+
+
+def test_ascii():
+    """Plain lowercase ASCII letters are detected as 'ascii' (case-insensitive)."""
+    assert cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')['encoding'].lower() == 'ascii'
+
+
+def test_detect():
+    """Each fixture's detected encoding must equal its file name (sans .txt)."""
+    for testfile in glob.glob('src/tests/testdata/*/*.txt'):
+        # Normalise Windows separators so the entries in SKIP_LIST match.
+        if testfile.replace("\\", "/") in SKIP_LIST:
+            continue
+
+        # The expected charset is the file's base name: "euc-tw.txt" -> "euc-tw".
+        expected_charset = os.path.splitext(os.path.basename(testfile))[0]
+        with open(testfile, 'rb') as f:
+            detected_encoding = cchardet.detect(f.read())
+        assert expected_charset.lower() == detected_encoding['encoding'].lower(), testfile
+
+
+def test_detector():
+    """Feed the SJIS sample line by line to UniversalDetector; expect shift_jis."""
+    detector = cchardet.UniversalDetector()
+    with open("src/tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt", 'rb') as f:
+        for line in f:
+            detector.feed(line)
+            # Stop reading as soon as the detector reports it is done.
+            if detector.done:
+                break
+    detector.close()
+    # The result is read only once the detector has been closed.
+    assert "shift_jis" == detector.result['encoding'].lower()
+
+
+def test_github_issue_20():
+    """
+    See https://github.com/PyYoshi/cChardet/issues/20
+    A lone 0x8f byte must pass through both detection APIs without raising.
+    """
+    lone_byte = b'\x8f'
+    cchardet.detect(lone_byte)
+
+    streaming = cchardet.UniversalDetector()
+    streaming.feed(lone_byte)
+    streaming.close()
+
+
+def test_decode():
+    """Every fixture outside SKIP_LIST_02 must decode with its detected encoding."""
+    for testfile in glob.glob('src/tests/testdata/*/*.txt'):
+        # Normalise Windows separators so the entries in SKIP_LIST_02 match.
+        if testfile.replace("\\", "/") in SKIP_LIST_02:
+            continue
+
+        with open(testfile, 'rb') as f:
+            msg = f.read()
+        detected_encoding = cchardet.detect(msg)
+        try:
+            msg.decode(detected_encoding["encoding"])
+        except LookupError:
+            # Unknown codec name: report the offending file, then fail the test.
+            print("LookupError: { file=%s, encoding=%s }" % (
+                testfile, detected_encoding["encoding"]))
+            raise
+
+
+def test_utf8_with_bom():
+    """A bare UTF-8 byte order mark is reported as 'utf-8-sig'."""
+    bom_only = b'\xEF\xBB\xBF'
+    assert cchardet.detect(bom_only)['encoding'].lower() == "utf-8-sig"
+
+
+def test_null_bytes():
+    """Input mixing a NUL byte with high bytes yields no encoding (None)."""
+    payload = b'ABC\x00\x80\x81'
+    result = cchardet.detect(payload)
+    assert result['encoding'] is None
+
+# def test_iso8859_2_csv(self):
+# testfile = 'tests/samples/iso8859-2.csv'
+# with open(testfile, 'rb') as f:
+# msg = f.read()
+# detected_encoding = cchardet.detect(msg)
+# eq_(
+# "iso8859-2",
+# detected_encoding['encoding'].lower(),
+# 'Expected %s, but got %s' % (
+# "iso8859-2",
+# detected_encoding['encoding'].lower()
+# )
+# )
|