summaryrefslogtreecommitdiff
path: root/sci-libs/datasets
diff options
context:
space:
mode:
authorV3n3RiX <venerix@koprulu.sector>2024-02-21 23:40:13 +0000
committerV3n3RiX <venerix@koprulu.sector>2024-02-21 23:40:13 +0000
commit51012ef0261a9da2e4487144906632d06607d3b9 (patch)
tree4bdd7443f88209d056985f2e4748277714e35829 /sci-libs/datasets
parentf2d1966a40070fb5ba3279db49b8435c7c143802 (diff)
gentoo auto-resync : 21:02:2024 - 23:40:13
Diffstat (limited to 'sci-libs/datasets')
-rw-r--r--sci-libs/datasets/Manifest4
-rw-r--r--sci-libs/datasets/datasets-2.16.0.ebuild14
-rw-r--r--sci-libs/datasets/files/datasets-2.16.0-tests.patch160
3 files changed, 118 insertions, 60 deletions
diff --git a/sci-libs/datasets/Manifest b/sci-libs/datasets/Manifest
index 968588f01e46..21c1c02b3ecd 100644
--- a/sci-libs/datasets/Manifest
+++ b/sci-libs/datasets/Manifest
@@ -1,5 +1,5 @@
AUX datasets-2.14.4-tests.patch 8616 BLAKE2B 8a65d1315b27658a5f741ebc022c83692252a4833ec4d7b79873799c2bb4bb68534a9e13e7fae1c9a6c051b3615fbb783e6e7885ed93968b31aea6629b4116c4 SHA512 2c6d27c297995466a0aebefa46e86113bdce7d84ea00bb1630549fc379fbb51d66f8f01a8d098d56ec2b26d5200f129460567abdbf6a63d4e2a61372fbfbc6a3
-AUX datasets-2.16.0-tests.patch 4104 BLAKE2B d58e08650467eaf589c72f4270980436c3a7ae72faf6ce433d0b3f4f136367b52041f80e51c6b141b655b3789111e60025d512a01d497c402bf7c74309370287 SHA512 2a6dc1c7851424aa746459e0b0d12bf8418e61c319b9a2918567781712618a382cfae719c56348460ede2fe06987c723a39834bb80ea5606918901a0fb182978
+AUX datasets-2.16.0-tests.patch 5619 BLAKE2B b143a97aa1ce2a4f793370557b18cb948872c856e20129c9d29159a531a1f1a7781957c8eb64420622865d52510376b1d7c757f51d6f0320613200e739212135 SHA512 a865fcb7e0d0437b1dc6cbac905772b5fdec2107f27abe331419111b49ffff237b877568edea354f57165ce7b66df0d3ac2506966259c229495f34722de23e9e
DIST datasets-2.16.0.gh.tar.gz 2163874 BLAKE2B baec91a0e39fac3e07f11e352a286c0940cbc672e7233267e70d1abb64dd31bae18c55213a20fafaeaf2f60268104f294c77c9b73ddc1b289175904288a7c440 SHA512 f2a17ffab192163cfc196cc2bad0adb2ca657b5cf911f74f299b6e29eb4fcfacc377505b1857974a6b55252eedf8775a8706f9e991450c55e5d613020dc03735
-EBUILD datasets-2.16.0.ebuild 1877 BLAKE2B f785451bcb81a275e6731496499ac948a9e1bb74d0cd8e509d29cfde50f8cf1d195b30a5aa1ed1b02ebc8da70da79f4af4c719359e633580e9400a7c54101fb3 SHA512 719d33d9f01c68e430341b6d8d2871fd5eeb8fa97e473125df44c0eed0cc1517d2f426df489df60031f6348a174a37aef26482da204c47c30a72f8fb9b12fe38
+EBUILD datasets-2.16.0.ebuild 2304 BLAKE2B 4af6b595b3344e5df5ddcc46146d637e1091a1b7237e4ab2c3f49299a14c22af32dd180902cac16a3723e3a67613951325d8e47481e0bd5081ce6daa9741286a SHA512 1c7df28fd520eb30bb40f42347cfa106d3f12bbc5c8b9eaf93cfcf956078ab2217b51f50f865949b0bd90e85f569adb59ad06ef350db79038dfa38ccb549b2c2
MISC metadata.xml 379 BLAKE2B 48ebb9e7bfa8b58b0d15b82c4146def465e08cf3212ab4af04129d09c153b67b00d0fa05b94d6af54f643ec3a202f2335d3254b966f49d1394d3c7b9e5da56a5 SHA512 99560decfaa0e438980f372d99257695e9ca9585167d9aba091e0b775c2f8384657ddc017841c8f06f8b568017a54fb9e31da736f3c875da717e154cdce876d1
diff --git a/sci-libs/datasets/datasets-2.16.0.ebuild b/sci-libs/datasets/datasets-2.16.0.ebuild
index 0325b5ae63d6..a34fcaa2f89c 100644
--- a/sci-libs/datasets/datasets-2.16.0.ebuild
+++ b/sci-libs/datasets/datasets-2.16.0.ebuild
@@ -66,4 +66,18 @@ src_prepare() {
sed -i -e \
"/pyarrow_hotfix/d" \
src/datasets/features/features.py || die
+ sed -i \
+ -e "s:pytest.mark.integration:pytest.mark.skip():g" \
+ tests/test_arrow_dataset.py \
+ tests/test_fingerprint.py \
+ tests/test_hf_gcp.py \
+ tests/test_inspect.py \
+ tests/test_iterable_dataset.py \
+ tests/test_load.py \
+ tests/test_offline_util.py \
+ tests/test_streaming_download_manager.py \
+ tests/commands/test_test.py \
+ tests/packaged_modules/test_cache.py \
+ || die
}
diff --git a/sci-libs/datasets/files/datasets-2.16.0-tests.patch b/sci-libs/datasets/files/datasets-2.16.0-tests.patch
index 6b2845bce168..8cb89e824b3b 100644
--- a/sci-libs/datasets/files/datasets-2.16.0-tests.patch
+++ b/sci-libs/datasets/files/datasets-2.16.0-tests.patch
@@ -10,51 +10,72 @@
],
--- a/tests/test_load.py 2024-02-20 22:12:13.699209107 +0100
+++ b/tests/test_load.py 2024-02-20 22:13:10.862626708 +0100
-@@ -386,21 +386,6 @@
+@@ -386,6 +386,7 @@
hf_modules_cache=self.hf_modules_cache,
)
-- def test_HubDatasetModuleFactoryWithScript_dont_trust_remote_code(self):
-- # "squad" has a dataset script
-- factory = HubDatasetModuleFactoryWithScript(
-- "squad", download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
-- )
-- with patch.object(config, "HF_DATASETS_TRUST_REMOTE_CODE", None): # this will be the default soon
-- self.assertRaises(ValueError, factory.get_module)
-- factory = HubDatasetModuleFactoryWithScript(
-- "squad",
-- download_config=self.download_config,
-- dynamic_modules_path=self.dynamic_modules_path,
-- trust_remote_code=False,
-- )
-- self.assertRaises(ValueError, factory.get_module)
--
++ @pytest.mark.skip(reason="")
+ def test_HubDatasetModuleFactoryWithScript_dont_trust_remote_code(self):
+ # "squad" has a dataset script
+ factory = HubDatasetModuleFactoryWithScript(
+@@ -402,6 +402,7 @@
+ )
+ self.assertRaises(ValueError, factory.get_module)
+
++ @pytest.mark.skip()
def test_HubDatasetModuleFactoryWithScript_with_github_dataset(self):
# "wmt_t2t" has additional imports (internal)
factory = HubDatasetModuleFactoryWithScript(
-@@ -1235,12 +1235,6 @@
-
-
- @pytest.mark.integration
--def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
-- ds = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True, token=hf_token)
-- assert next(iter(ds)) is not None
--
--
--@pytest.mark.integration
- def test_load_dataset_config_kwargs_passed_as_arguments():
- ds_default = load_dataset(SAMPLE_DATASET_IDENTIFIER4)
- ds_custom = load_dataset(SAMPLE_DATASET_IDENTIFIER4, drop_metadata=True)
+@@ -411,6 +412,7 @@
+ assert importlib.import_module(module_factory_result.module_path) is not None
+ assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT)
+
++ @pytest.mark.skip()
+ def test_GithubMetricModuleFactory_with_internal_import(self):
+ # "squad_v2" requires additional imports (internal)
+ factory = GithubMetricModuleFactory(
+@@ -419,6 +421,7 @@
+ module_factory_result = factory.get_module()
+ assert importlib.import_module(module_factory_result.module_path) is not None
+
++ @pytest.mark.skip()
+ @pytest.mark.filterwarnings("ignore:GithubMetricModuleFactory is deprecated:FutureWarning")
+ def test_GithubMetricModuleFactory_with_external_import(self):
+ # "bleu" requires additional imports (external from github)
+@@ -1032,6 +1035,7 @@
+ datasets.load_dataset_builder(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, "non-existing-config")
+
+
++@pytest.mark.skip()
+ @pytest.mark.parametrize("serializer", [pickle, dill])
+ def test_load_dataset_builder_with_metadata_configs_pickable(serializer):
+ builder = datasets.load_dataset_builder(SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA)
+@@ -1153,6 +1157,7 @@
+ assert len(builder.config.data_files["test"]) > 0
+
+
++@pytest.mark.skip()
+ def test_load_dataset_builder_fail():
+ with pytest.raises(DatasetNotFoundError):
+ datasets.load_dataset_builder("blabla")
+@@ -1168,6 +1173,7 @@
+ assert isinstance(next(iter(dataset["train"])), dict)
+
+
++@pytest.mark.skip()
+ def test_load_dataset_cached_local_script(dataset_loading_script_dir, data_dir, caplog):
+ dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir)
+ assert isinstance(dataset, DatasetDict)
--- a/tests/test_hf_gcp.py 2024-02-21 09:59:26.918397895 +0100
+++ b/tests/test_hf_gcp.py 2024-02-21 09:59:46.335100597 +0100
-@@ -21,7 +21,6 @@
- {"dataset": "wikipedia", "config_name": "20220301.frr"},
- {"dataset": "wikipedia", "config_name": "20220301.it"},
- {"dataset": "wikipedia", "config_name": "20220301.simple"},
-- {"dataset": "eli5", "config_name": "LFQA_reddit"},
- {"dataset": "wiki40b", "config_name": "en"},
- {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.compressed"},
- {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.no_index"},
+@@ -47,6 +47,7 @@
+ ]
+
+
++@pytest.mark.skip("network")
+ @parameterized.named_parameters(list_datasets_on_hf_gcp_parameters(with_config=True))
+ class TestDatasetOnHfGcp(TestCase):
+ dataset = None
--- a/tests/test_inspect.py 2024-02-21 10:03:32.315520016 +0100
+++ b/tests/test_inspect.py 2024-02-21 10:03:50.345553490 +0100
@@ -18,7 +18,7 @@
@@ -66,24 +87,47 @@
def test_inspect_dataset(path, tmp_path):
inspect_dataset(path, tmp_path)
script_name = Path(path).stem + ".py"
---- a/tests/packaged_modules/test_cache.py 2024-02-21 12:04:18.036866572 +0100
-+++ b/tests/packaged_modules/test_cache.py 2024-02-21 12:04:54.333558520 +0100
-@@ -44,18 +44,3 @@
- Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare()
- with pytest.raises(ValueError):
- Cache(dataset_name=text_dir.name, config_name="missing", version="auto", hash="auto").download_and_prepare()
--
--
--@pytest.mark.integration
--def test_cache_multi_configs():
-- repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA
-- dataset_name = repo_id.split("/")[-1]
-- config_name = "v1"
-- ds = load_dataset(repo_id, config_name)
-- cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, version="auto", hash="auto")
-- reloaded = cache.as_dataset()
-- assert list(ds) == list(reloaded)
-- assert len(ds["train"]) == len(reloaded["train"])
-- with pytest.raises(ValueError) as excinfo:
-- Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
-- assert config_name in str(excinfo.value)
+@@ -49,6 +49,7 @@
+ assert list(info.splits.keys()) == expected_splits
+
+
++@pytest.mark.skip(reason="require network")
+ def test_get_dataset_config_info_private(hf_token, hf_private_dataset_repo_txt_data):
+ info = get_dataset_config_info(hf_private_dataset_repo_txt_data, config_name="default", token=hf_token)
+ assert list(info.splits.keys()) == ["train"]
+--- a/tests/test_data_files.py 2024-02-21 20:22:57.536160356 +0100
++++ b/tests/test_data_files.py 2024-02-21 20:25:00.153052174 +0100
+@@ -378,6 +378,7 @@
+ assert len(hub_dataset_repo_patterns_results[pattern]) == 0
+
+
++@pytest.mark.skip(reason="network")
+ def test_DataFilesList_from_patterns_locally_with_extra_files(complex_data_dir, text_file):
+ data_files_list = DataFilesList.from_patterns([_TEST_URL, text_file.as_posix()], complex_data_dir)
+ assert list(data_files_list) == [_TEST_URL, text_file.as_posix()]
+@@ -467,6 +468,7 @@
+ assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
+
+
++@pytest.mark.skip(reason="network")
+ def test_DataFilesDict_from_patterns_locally_or_remote_hashing(text_file):
+ patterns = {"train": [_TEST_URL], "test": [str(text_file)]}
+ data_files1 = DataFilesDict.from_patterns(patterns)
+--- a/tests/packaged_modules/test_folder_based_builder.py 2024-02-21 21:30:20.718922523 +0100
++++ b/tests/packaged_modules/test_folder_based_builder.py 2024-02-21 21:31:46.309061287 +0100
+@@ -382,6 +382,7 @@
+ assert example[column] is not None
+
+
++@pytest.mark.skip(reason="network")
+ @pytest.mark.parametrize("remote", [True, False])
+ @pytest.mark.parametrize("drop_labels", [None, True, False])
+ def test_data_files_with_different_levels_no_metadata(
+@@ -405,6 +406,7 @@
+ assert all(example.keys() == {"base", "label"} for _, example in generator)
+
+
++@pytest.mark.skip(reason="network")
+ @pytest.mark.parametrize("remote", [False, True])
+ @pytest.mark.parametrize("drop_labels", [None, True, False])
+ def test_data_files_with_one_label_no_metadata(data_files_with_one_label_no_metadata, drop_labels, remote, cache_dir):