From f2d1966a40070fb5ba3279db49b8435c7c143802 Mon Sep 17 00:00:00 2001
From: V3n3RiX
Date: Wed, 21 Feb 2024 17:40:23 +0000
Subject: gentoo auto-resync : 21:02:2024 - 17:40:23

---
 sci-libs/datasets/Manifest                     |  6 +-
 sci-libs/datasets/datasets-2.15.0.ebuild       | 69 -----------------
 sci-libs/datasets/datasets-2.16.0.ebuild       | 69 +++++++++++++++++
 .../datasets/files/datasets-2.15.0-tests.patch | 46 -----------
 .../datasets/files/datasets-2.16.0-tests.patch | 89 ++++++++++++++++++++++
 5 files changed, 161 insertions(+), 118 deletions(-)
 delete mode 100644 sci-libs/datasets/datasets-2.15.0.ebuild
 create mode 100644 sci-libs/datasets/datasets-2.16.0.ebuild
 delete mode 100644 sci-libs/datasets/files/datasets-2.15.0-tests.patch
 create mode 100644 sci-libs/datasets/files/datasets-2.16.0-tests.patch

(limited to 'sci-libs/datasets')

diff --git a/sci-libs/datasets/Manifest b/sci-libs/datasets/Manifest
index 1ec11d3f0d8f..968588f01e46 100644
--- a/sci-libs/datasets/Manifest
+++ b/sci-libs/datasets/Manifest
@@ -1,5 +1,5 @@
 AUX datasets-2.14.4-tests.patch 8616 BLAKE2B 8a65d1315b27658a5f741ebc022c83692252a4833ec4d7b79873799c2bb4bb68534a9e13e7fae1c9a6c051b3615fbb783e6e7885ed93968b31aea6629b4116c4 SHA512 2c6d27c297995466a0aebefa46e86113bdce7d84ea00bb1630549fc379fbb51d66f8f01a8d098d56ec2b26d5200f129460567abdbf6a63d4e2a61372fbfbc6a3
-AUX datasets-2.15.0-tests.patch 1949 BLAKE2B 966b7077ffc3182047717205300bfb630180d10df82c9a1296fc46bce329b33b1506d3f58a3a569cff8c333845b2221d7ef1c85602ffeba4f9130954b23f5c93 SHA512 0c298cf4e05c48e890bd05777804d2ac64e16b8750b4b19aef6a410e3772dc11166df5c8ab0a32425bdcb710757ffab82eaf04be215eacf9d6facdedaefc304f
-DIST datasets-2.15.0.gh.tar.gz 2147191 BLAKE2B eadf0133f0baa9f0469a51f28e00d3656b2b799ed1ff221ad6df39640c9777ccd46b706e46898ffa0597bc43288ee5991410d5c6d0a2cb3b814658c92d779a68 SHA512 589ca7992d58007c556558ef0889354fe34821f55e79025ea475d08c105428fe84c77c9183ec0028d8e60b25ba0ea8565bd8c6003a85bb6472d1cb4a247142e2
-EBUILD datasets-2.15.0.ebuild 1885 BLAKE2B 55023520851c42d9e247a74e43efedcf74afa9a7ac5953b13f9770ac27b577dcb1e9068a352d5dc09d90440735a82cab805a38593093588a620c86325cf90273 SHA512 c959852bce1ccb2352ff4cfb9eff1311b00156b2c3d585806db39c6d500cc766ecfe31bf4f776e8f03ab89793dec4a28abe48f0843994e78d8ccf32c08b586a1
+AUX datasets-2.16.0-tests.patch 4104 BLAKE2B d58e08650467eaf589c72f4270980436c3a7ae72faf6ce433d0b3f4f136367b52041f80e51c6b141b655b3789111e60025d512a01d497c402bf7c74309370287 SHA512 2a6dc1c7851424aa746459e0b0d12bf8418e61c319b9a2918567781712618a382cfae719c56348460ede2fe06987c723a39834bb80ea5606918901a0fb182978
+DIST datasets-2.16.0.gh.tar.gz 2163874 BLAKE2B baec91a0e39fac3e07f11e352a286c0940cbc672e7233267e70d1abb64dd31bae18c55213a20fafaeaf2f60268104f294c77c9b73ddc1b289175904288a7c440 SHA512 f2a17ffab192163cfc196cc2bad0adb2ca657b5cf911f74f299b6e29eb4fcfacc377505b1857974a6b55252eedf8775a8706f9e991450c55e5d613020dc03735
+EBUILD datasets-2.16.0.ebuild 1877 BLAKE2B f785451bcb81a275e6731496499ac948a9e1bb74d0cd8e509d29cfde50f8cf1d195b30a5aa1ed1b02ebc8da70da79f4af4c719359e633580e9400a7c54101fb3 SHA512 719d33d9f01c68e430341b6d8d2871fd5eeb8fa97e473125df44c0eed0cc1517d2f426df489df60031f6348a174a37aef26482da204c47c30a72f8fb9b12fe38
 MISC metadata.xml 379 BLAKE2B 48ebb9e7bfa8b58b0d15b82c4146def465e08cf3212ab4af04129d09c153b67b00d0fa05b94d6af54f643ec3a202f2335d3254b966f49d1394d3c7b9e5da56a5 SHA512 99560decfaa0e438980f372d99257695e9ca9585167d9aba091e0b775c2f8384657ddc017841c8f06f8b568017a54fb9e31da736f3c875da717e154cdce876d1
diff --git a/sci-libs/datasets/datasets-2.15.0.ebuild b/sci-libs/datasets/datasets-2.15.0.ebuild
deleted file mode 100644
index 52af2f93ac88..000000000000
--- a/sci-libs/datasets/datasets-2.15.0.ebuild
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2023-2024 Gentoo Authors
-# Distributed under the terms of the GNU General Public License v2
-
-EAPI=8
-
-DISTUTILS_USE_PEP517=setuptools
-PYTHON_COMPAT=( python3_{9..11} )
-DISTUTILS_SINGLE_IMPL=1
-inherit distutils-r1
-
-DESCRIPTION="Access and share datasets for Audio, Computer Vision, and NLP tasks"
-HOMEPAGE="
-	https://pypi.org/project/datasets/
-"
-SRC_URI="https://github.com/huggingface/${PN}/archive/refs/tags/${PV}.tar.gz
-	-> ${P}.gh.tar.gz"
-IUSE="test"
-
-LICENSE="Apache-2.0"
-SLOT="0"
-KEYWORDS="~amd64"
-
-# For pin on fsspec see https://github.com/huggingface/datasets/issues/6333
-RDEPEND="
-	${PYTHON_DEPS}
-	sci-libs/pytorch[${PYTHON_SINGLE_USEDEP}]
-	$(python_gen_cond_dep '
-		dev-python/absl-py[${PYTHON_USEDEP}]
-		dev-python/aiohttp[${PYTHON_USEDEP}]
-		<=dev-python/fsspec-2023.10.0[${PYTHON_USEDEP}]
-		dev-python/multiprocess[${PYTHON_USEDEP}]
-		dev-python/packaging[${PYTHON_USEDEP}]
-		dev-python/pandas[${PYTHON_USEDEP}]
-		dev-python/pyarrow[${PYTHON_USEDEP},parquet,snappy]
-		dev-python/pyyaml[${PYTHON_USEDEP}]
-		dev-python/tqdm[${PYTHON_USEDEP}]
-		dev-python/xxhash[${PYTHON_USEDEP}]
-		dev-python/zstandard[${PYTHON_USEDEP}]
-		>=sci-libs/huggingface_hub-0.14.0[${PYTHON_USEDEP}]
-		sci-libs/scikit-learn[${PYTHON_USEDEP}]
-	')
-"
-DEPEND="${RDEPEND}"
-BDEPEND="test? (
-	$(python_gen_cond_dep '
-		dev-python/absl-py[${PYTHON_USEDEP}]
-		dev-python/pytest-datadir[${PYTHON_USEDEP}]
-		dev-python/decorator[${PYTHON_USEDEP}]
-		=dev-python/sqlalchemy-1*[${PYTHON_USEDEP}]
-		sci-libs/jiwer[${PYTHON_USEDEP}]
-		sci-libs/seqeval[${PYTHON_USEDEP}]
-	')
-)"
-
-PATCHES=(
-	"${FILESDIR}"/${PN}-2.14.4-tests.patch
-	"${FILESDIR}"/${P}-tests.patch
-)
-
-distutils_enable_tests pytest
-
-src_prepare() {
-	distutils-r1_src_prepare
-	rm tests/packaged_modules/test_spark.py || die
-	rm tests/test_upstream_hub.py || die
-	sed -i -e \
-		"/pyarrow_hotfix/d" \
-		src/datasets/features/features.py || die
-}
diff --git a/sci-libs/datasets/datasets-2.16.0.ebuild b/sci-libs/datasets/datasets-2.16.0.ebuild
new file mode 100644
index 000000000000..0325b5ae63d6
--- /dev/null
+++ b/sci-libs/datasets/datasets-2.16.0.ebuild
@@ -0,0 +1,69 @@
+# Copyright 2023-2024 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=8
+
+DISTUTILS_USE_PEP517=setuptools
+PYTHON_COMPAT=( python3_{10..12} )
+DISTUTILS_SINGLE_IMPL=1
+inherit distutils-r1
+
+DESCRIPTION="Access and share datasets for Audio, Computer Vision, and NLP tasks"
+HOMEPAGE="
+	https://pypi.org/project/datasets/
+"
+SRC_URI="https://github.com/huggingface/${PN}/archive/refs/tags/${PV}.tar.gz
+	-> ${P}.gh.tar.gz"
+IUSE="test"
+
+LICENSE="Apache-2.0"
+SLOT="0"
+KEYWORDS="~amd64"
+
+# For pin on fsspec see https://github.com/huggingface/datasets/issues/6333
+RDEPEND="
+	${PYTHON_DEPS}
+	sci-libs/pytorch[${PYTHON_SINGLE_USEDEP}]
+	$(python_gen_cond_dep '
+		dev-python/absl-py[${PYTHON_USEDEP}]
+		dev-python/aiohttp[${PYTHON_USEDEP}]
+		<=dev-python/fsspec-2023.10.0[${PYTHON_USEDEP}]
+		dev-python/multiprocess[${PYTHON_USEDEP}]
+		dev-python/packaging[${PYTHON_USEDEP}]
+		dev-python/pandas[${PYTHON_USEDEP}]
+		dev-python/pyarrow[${PYTHON_USEDEP},parquet,snappy]
+		dev-python/pyyaml[${PYTHON_USEDEP}]
+		dev-python/tqdm[${PYTHON_USEDEP}]
+		dev-python/xxhash[${PYTHON_USEDEP}]
+		dev-python/zstandard[${PYTHON_USEDEP}]
+		sci-libs/huggingface_hub[${PYTHON_USEDEP}]
+		sci-libs/scikit-learn[${PYTHON_USEDEP}]
+	')
+"
+DEPEND="${RDEPEND}"
+BDEPEND="test? (
+	$(python_gen_cond_dep '
+		dev-python/absl-py[${PYTHON_USEDEP}]
+		dev-python/pytest-datadir[${PYTHON_USEDEP}]
+		dev-python/decorator[${PYTHON_USEDEP}]
+		=dev-python/sqlalchemy-1*[${PYTHON_USEDEP}]
+		sci-libs/jiwer[${PYTHON_USEDEP}]
+		sci-libs/seqeval[${PYTHON_USEDEP}]
+	')
+)"
+
+PATCHES=(
+	"${FILESDIR}"/${PN}-2.14.4-tests.patch
+	"${FILESDIR}"/${P}-tests.patch
+)
+
+distutils_enable_tests pytest
+
+src_prepare() {
+	distutils-r1_src_prepare
+	rm tests/packaged_modules/test_spark.py || die
+	rm tests/test_upstream_hub.py || die
+	sed -i -e \
+		"/pyarrow_hotfix/d" \
+		src/datasets/features/features.py || die
+}
diff --git a/sci-libs/datasets/files/datasets-2.15.0-tests.patch b/sci-libs/datasets/files/datasets-2.15.0-tests.patch
deleted file mode 100644
index 64d8dcfdc8d8..000000000000
--- a/sci-libs/datasets/files/datasets-2.15.0-tests.patch
+++ /dev/null
@@ -1,46 +0,0 @@
---- a/tests/test_arrow_dataset.py	2024-02-20 21:53:24.248470991 +0100
-+++ b/tests/test_arrow_dataset.py	2024-02-20 21:53:29.441804737 +0100
-@@ -3978,7 +3978,6 @@
-     [
-         "relative/path",
-         "/absolute/path",
--        "s3://bucket/relative/path",
-         "hdfs://relative/path",
-         "hdfs:///absolute/path",
-     ],
---- a/tests/test_hf_gcp.py	2024-02-20 21:55:18.821852434 +0100
-+++ b/tests/test_hf_gcp.py	2024-02-20 21:55:46.525186394 +0100
-@@ -22,7 +22,6 @@
-     {"dataset": "wikipedia", "config_name": "20220301.it"},
-     {"dataset": "wikipedia", "config_name": "20220301.simple"},
-     {"dataset": "snli", "config_name": "plain_text"},
--    {"dataset": "eli5", "config_name": "LFQA_reddit"},
-     {"dataset": "wiki40b", "config_name": "en"},
-     {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.compressed"},
-     {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.no_index"},
---- a/tests/test_inspect.py	2024-02-20 22:01:35.148488467 +0100
-+++ b/tests/test_inspect.py	2024-02-20 22:02:14.458561571 +0100
-@@ -15,7 +15,7 @@
- pytestmark = pytest.mark.integration
- 
- 
--@pytest.mark.parametrize("path", ["paws", "csv"])
-+@pytest.mark.parametrize("path", ["csv"])
- def test_inspect_dataset(path, tmp_path):
-     inspect_dataset(path, tmp_path)
-     script_name = path + ".py"
---- a/tests/test_load.py	2024-02-20 22:12:13.699209107 +0100
-+++ b/tests/test_load.py	2024-02-20 22:13:10.862626708 +0100
-@@ -1235,12 +1235,6 @@
- 
- 
- @pytest.mark.integration
--def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
--    ds = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True, token=hf_token)
--    assert next(iter(ds)) is not None
--
--
--@pytest.mark.integration
- def test_load_dataset_config_kwargs_passed_as_arguments():
-     ds_default = load_dataset(SAMPLE_DATASET_IDENTIFIER4)
-     ds_custom = load_dataset(SAMPLE_DATASET_IDENTIFIER4, drop_metadata=True)
diff --git a/sci-libs/datasets/files/datasets-2.16.0-tests.patch b/sci-libs/datasets/files/datasets-2.16.0-tests.patch
new file mode 100644
index 000000000000..6b2845bce168
--- /dev/null
+++ b/sci-libs/datasets/files/datasets-2.16.0-tests.patch
@@ -0,0 +1,89 @@
+--- a/tests/test_arrow_dataset.py	2024-02-20 21:53:24.248470991 +0100
++++ b/tests/test_arrow_dataset.py	2024-02-20 21:53:29.441804737 +0100
+@@ -3982,7 +3982,6 @@
+     [
+         "relative/path",
+         "/absolute/path",
+-        "s3://bucket/relative/path",
+         "hdfs://relative/path",
+         "hdfs:///absolute/path",
+     ],
+--- a/tests/test_load.py	2024-02-20 22:12:13.699209107 +0100
++++ b/tests/test_load.py	2024-02-20 22:13:10.862626708 +0100
+@@ -386,21 +386,6 @@
+             hf_modules_cache=self.hf_modules_cache,
+         )
+ 
+-    def test_HubDatasetModuleFactoryWithScript_dont_trust_remote_code(self):
+-        # "squad" has a dataset script
+-        factory = HubDatasetModuleFactoryWithScript(
+-            "squad", download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
+-        )
+-        with patch.object(config, "HF_DATASETS_TRUST_REMOTE_CODE", None):  # this will be the default soon
+-            self.assertRaises(ValueError, factory.get_module)
+-        factory = HubDatasetModuleFactoryWithScript(
+-            "squad",
+-            download_config=self.download_config,
+-            dynamic_modules_path=self.dynamic_modules_path,
+-            trust_remote_code=False,
+-        )
+-        self.assertRaises(ValueError, factory.get_module)
+-
+     def test_HubDatasetModuleFactoryWithScript_with_github_dataset(self):
+         # "wmt_t2t" has additional imports (internal)
+         factory = HubDatasetModuleFactoryWithScript(
+@@ -1235,12 +1235,6 @@
+ 
+ 
+ @pytest.mark.integration
+-def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
+-    ds = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True, token=hf_token)
+-    assert next(iter(ds)) is not None
+-
+-
+-@pytest.mark.integration
+ def test_load_dataset_config_kwargs_passed_as_arguments():
+     ds_default = load_dataset(SAMPLE_DATASET_IDENTIFIER4)
+     ds_custom = load_dataset(SAMPLE_DATASET_IDENTIFIER4, drop_metadata=True)
+--- a/tests/test_hf_gcp.py	2024-02-21 09:59:26.918397895 +0100
++++ b/tests/test_hf_gcp.py	2024-02-21 09:59:46.335100597 +0100
+@@ -21,7 +21,6 @@
+     {"dataset": "wikipedia", "config_name": "20220301.frr"},
+     {"dataset": "wikipedia", "config_name": "20220301.it"},
+     {"dataset": "wikipedia", "config_name": "20220301.simple"},
+-    {"dataset": "eli5", "config_name": "LFQA_reddit"},
+     {"dataset": "wiki40b", "config_name": "en"},
+     {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.compressed"},
+     {"dataset": "wiki_dpr", "config_name": "psgs_w100.nq.no_index"},
+--- a/tests/test_inspect.py	2024-02-21 10:03:32.315520016 +0100
++++ b/tests/test_inspect.py	2024-02-21 10:03:50.345553490 +0100
+@@ -18,7 +18,7 @@
+ pytestmark = pytest.mark.integration
+ 
+ 
+-@pytest.mark.parametrize("path", ["paws", csv.__file__])
++@pytest.mark.parametrize("path", [csv.__file__])
+ def test_inspect_dataset(path, tmp_path):
+     inspect_dataset(path, tmp_path)
+     script_name = Path(path).stem + ".py"
+--- a/tests/packaged_modules/test_cache.py	2024-02-21 12:04:18.036866572 +0100
++++ b/tests/packaged_modules/test_cache.py	2024-02-21 12:04:54.333558520 +0100
+@@ -44,18 +44,3 @@
+         Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare()
+     with pytest.raises(ValueError):
+         Cache(dataset_name=text_dir.name, config_name="missing", version="auto", hash="auto").download_and_prepare()
+-
+-
+-@pytest.mark.integration
+-def test_cache_multi_configs():
+-    repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA
+-    dataset_name = repo_id.split("/")[-1]
+-    config_name = "v1"
+-    ds = load_dataset(repo_id, config_name)
+-    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, version="auto", hash="auto")
+-    reloaded = cache.as_dataset()
+-    assert list(ds) == list(reloaded)
+-    assert len(ds["train"]) == len(reloaded["train"])
+-    with pytest.raises(ValueError) as excinfo:
+-        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
+-    assert config_name in str(excinfo.value)
--
cgit v1.2.3