From 93a93e9a3b53c1a73142a305ea1f8136846942ee Mon Sep 17 00:00:00 2001 From: V3n3RiX Date: Wed, 22 Dec 2021 14:08:05 +0000 Subject: gentoo resync : 22.12.2021 --- dev-python/nltk-data/Manifest | 10 +- dev-python/nltk-data/nltk-data-20211023.ebuild | 203 ------------------------ dev-python/nltk-data/nltk-data-20211221.ebuild | 210 +++++++++++++++++++++++++ 3 files changed, 216 insertions(+), 207 deletions(-) delete mode 100644 dev-python/nltk-data/nltk-data-20211023.ebuild create mode 100644 dev-python/nltk-data/nltk-data-20211221.ebuild (limited to 'dev-python/nltk-data') diff --git a/dev-python/nltk-data/Manifest b/dev-python/nltk-data/Manifest index c8d69b0aadf5..59c9a9692918 100644 --- a/dev-python/nltk-data/Manifest +++ b/dev-python/nltk-data/Manifest @@ -29,7 +29,7 @@ DIST nltk-gazetteers-20200312.zip 8265 BLAKE2B aa4f8db99a7ae8c26911d33abb67713bc DIST nltk-genesis-20200312.zip 473239 BLAKE2B 5234c2bcc31ae738bbd6cdfe1ee231a75ceacc74334e3b03fcefb409427406aee1700e0d8ca8fad1444736005b53fdb0fbdb9068085837f4bf7292cf61351b8b SHA512 e74f6a8f304bca292bf8e138a739a97633075cf9374c95f43ec75473cd7bbeb0921fe24f7864659e640e694d227e61debd67bc41150bd898b7241a274e28196f DIST nltk-gutenberg-20200312.zip 4251829 BLAKE2B 54448e919010f12faa92c0ffd4adb31a7d1d76c262d5ac99c10129d188830107621f90e4ba611ffc1541c25037fc916b451169a8ae1df276747890fea2b5e06f SHA512 7b4442d18b89f9bb8261f23ad2f4d513b470461255633cbebe2b0c310f003effe819f7a86b39a62a6cde1b3fdbf099dabbdaaed520268c99fd1bb933baa97ead DIST nltk-ieer-20200312.zip 166156 BLAKE2B 4c63be6f0f16e59584b7f97c21da0cf60a1efd3678ce662d683d8a6a9d05d333690b26ebb037d3047b26e3b51163ffa666f048ad8a89ee6c851ddd3edd2faf58 SHA512 b55545f0ac03282fb85c4a86bb69fd9988db8cc504114bff7b763007e20f559efa61b8ae5b7955c251c1c2dd9d23246e23515b1cd5d8d28225c20ec236c05cd8 -DIST nltk-inaugural-20200312.zip 329806 BLAKE2B 222e22742d93156b852ddd72ffccdd5be5c907f43973b0ceee8d8d383bc5387575d1f974b29fd6d59499ec86483f56270251e6a5e48d7840c577207849baed59 SHA512 096591769373019883b9a2f85d32bc86021aefb74deb26dc9005233ae7f317d587c247edc53f1a956b8e29911656419d9677c271fb59ded4ca4edf2a80ff86b7 +DIST nltk-inaugural-20211221.zip 346476 BLAKE2B 6f14827fcd8edf9300725c6ffcfa899f5878537aaefc40b165581af60866ca7e7b0ed316a2b8dd87ff858667ce7a5f4035090a1cc3a404afa64b28d34dcf8747 SHA512 5440bff001209e42ee86c2501d2cd628cf639734c194c11097a862f6759f09a6adef1da0702b77211774f84d5d9d4ee55e89085b39d8a911811def1bd07ac8e2 DIST nltk-indian-20200312.zip 199187 BLAKE2B 9f70f1f28bcb96d828ca2ed96d6f89cd8eca8391b64842c742e940aaf59f31e142a4919ab5a21bce954bbe2ed85add162ac67c9549758f9aa44b3e90d45d44a7 SHA512 08e9342567bcc607e9007aea8415af2cad1f8a289104b47a61bf569f1391629ebbe25f19aacf53e2b14725edfdf18076331f044e1c2e42d07c96d5c875ff8f9e DIST nltk-jeita-20200312.zip 16531215 BLAKE2B 7cf98b389d5c4b41d9d613c14c5116b83c520cfe7418ef982a6acc1294d8ecdb8a5d6d396409c9f13ca83df999793619d5eb5483612e87874766afe565c68362 SHA512 491e6dc27d70ebfd139be4dcd179b02c383f38edbe4290aed85638ecd7c987259b025e4f6f3f1d5bfe2b7aa3eff7188109205417d0acb64759988150f2753129 DIST nltk-kimmo-20200312.zip 186958 BLAKE2B ca288f48c09a33b86449c780e5c8612c3521e22bfe70918629bf3144b75b5566a942c3cc1440a13c2d81e3a69c77928c336850cc9fac4c232deb77293bfbe4e6 SHA512 fe38d5298ffb6dd62223c2002278d9ddf2695f341a29477f44434644bc6766cfe2c73091067f9eb8aea201735eef5012f96ba267f2e6ea3aec41ac206a43cac2 @@ -49,6 +49,7 @@ DIST nltk-names-20200312.zip 21326 BLAKE2B 37389f44337724fac26eaad5a4c310ea244b4 DIST nltk-nombank.1.0-20200312.zip 6728397 BLAKE2B 4dfb61fb651817086ad20957d02a39c99dff83ec45dc1a01e1dac055e0d7d9ef8108ef8df5218ce57b2b0866b748f5f76663a58e08f5e1731272173888e714e9 SHA512 81bcf29898ba1205ccbc8486d14518e95b0189b97b172a54580d4cb4584f4ef32a5284fe0abed0b954e67a318de8f10de5df1a20e6c08976df09cd2c9c66cb32 DIST nltk-nonbreaking_prefixes-20200312.zip 25437 BLAKE2B 9f26efbcc429f2d36a5584014604d2ca9d4de3864aa6192a114e9e2cd55ff1df50d4321a8c40dbe96225fad6be51f7be66f8a4d58146e370a1d33a9305f7f95f SHA512 4cddfdacf1c7fe4e93a31333413a83ebea2ea7907f4e6fe99e1c6ff4aed8efc951f4399478cdb88e5d5dc01cfb734b6672f4007b2a17c4784896fecaf4a93cdf DIST nltk-nps_chat-20200312.zip 301366 BLAKE2B eadf0d0ee54c982765a124321fe6de161bfbfbfb0385d5d7e171666945ef313b960ba7b820e4fed02885ffc213173e2a7c97822513ae6e81e68858af21d9f424 SHA512 a716f3aaba3aedbbdaeaff1c6f1ef607476aa989eaef1c2971b827215f3167952477b203f1543d0fd27e5594c3efa01349c6a0efc1ac423048808f18eae94643 +DIST nltk-omw-1.4-20211221.zip 26634772 BLAKE2B 4d5fe648987b9aaa589d612f439560287200fe947a80182b941b0660a97a739ddda00a472209a26715aa72c44f37cdb5dbce9df89cb7bfbcdb7f4fa2908bb96e SHA512 d25c591c894fedeaf8649ef26653c51b74282e8af43bd0a8867eb16f79a89c4b567328a7c83f7651ec9e41dc746cba87b0c8bdbbe736db7f18c62a56820a92e6 DIST nltk-omw-20200312.zip 12110409 BLAKE2B 5ae634fdd89fcd2be2dea1511b2c434755e16b92804ea6bb5e2139113c644885d142e6290ab1d00c418f9df5241ebf6b7e227da120dc95d1e907d8f62a3d0cb8 SHA512 05b1b014e55738c49b34ea0371b95bd1d1aadfb0cc57325ec110ffbbb1250690e3a4aab45e877eb5433de08c7afe3ffa161e29976a0e28db9803fb637b24c843 DIST nltk-opinion_lexicon-20200312.zip 24947 BLAKE2B 47f5a381647075f4bea0230939098a6e83b47860165595e72f076a5436e2aad78148606d86ec331b1cc323963cb2a7500669c6fc2aff2e889bbaec8fd8571722 SHA512 75b98afa24c4e0284b2858770d7655c5c399066f56c1b9aa03dd51d80f211a694856695f8192570f67f02f0827ab02c3f3b65feb5a56c04bebfae3b58ca56ba1 DIST nltk-panlex_swadesh-20200312.zip 2861668 BLAKE2B e9ee3b2296c8738319a86dfb39d34fc5bcd346e725e0ad0df74571106ef05b71126aff70914e914cff436674c0d70c71ca36121331ab0327932505a2bdb26be1 SHA512 d096be1c6ea54814e448f61048f05df9107c0ca5f75eb3051a009efd4643c94188dde03505b0a4ae29d301268ffc5153eeee7012aae18310f9288fd4ccdb2287 @@ -76,7 +77,7 @@ DIST nltk-senseval-20200312.zip 2151350 BLAKE2B ec21463f7ffe27e3dda47e38e3563464 DIST nltk-sentence_polarity-20200312.zip 490256 BLAKE2B 56a1b4668875d5c73b220710e8af84410a273f394d68d7e9decb47c5370f0f7c1982d37710ab903d78dac0108ec2e1a26e8f1cdd3fee172487397c8b2ae01ee7 SHA512 d3772edc13d46fb835f40a9c6714b95885d3ba2eded37a24c8aad6d3e63ca49a9c79387218e2f67bda2e52d52449daf1dd1e603a4819ae46da78f4e0a38ba599 DIST nltk-sentiwordnet-20200312.zip 4686546 BLAKE2B 98a45d9a5be971bd9e76ad5c154359c1800aa96ab1e8e301215e435cae6d659cc0fb38604435da4e9a391741a72ef7fcb842a1cf119348a7039f717843035f12 SHA512 cd0ff31abec3b85e3114546918c9152360e3da45fbbaf7b4c76a13a6d288f1d9ea7a4633638080cd0205cc293b4123445106e9b5daddb2ae379d6a08d1c992a5 DIST nltk-shakespeare-20200312.zip 475458 BLAKE2B c60acbf2d90f5fffdae43bdf85191dbd694ec5ae19e37b145ad04af6d8481988f0c2ce644baff2e31707e86d95a9b3b03c1da4faaa46be92da2291cce565955e SHA512 72cbbdb237b78df1f6bf27b5504606f805433746b9862dd1ce544ab712783a41fb165f3a4f282a7f1bd58c7dfe6036c14ecb07a82c109f6d326eb95e470002fb -DIST nltk-sinica_treebank-20200312.zip 899237 BLAKE2B e074dbdc7f1fe5958109f8ea019bead946bd2e3cd1c0849cf3ad49eaa34a1989baee7935a70e9103c121e5d375954d1dc53e4a1599e394ce6c2fda4e98769645 SHA512 724fb0905cbd0f903c3a75d70aee8a3e6f29e62971c1501efd8bfb3f4c6619e5fd8a285648e7792ea8e86bbda0a2afc6eecae45ad9f361983bf83b756ab52960 +DIST nltk-sinica_treebank-20211221.zip 906706 BLAKE2B 0c00bc7d39765b86eb99f55e82522d390cf0270fd2e2688d5825481b6f11e54aaa71b3f9ac95af170be543353746777da130bb6b294b0529330c712f9edaaab8 SHA512 22d44e23e89691f775a0e54971fada692aecd7cfa5181d49e9a53d0a96dc3be1beaa1e1ec6840f9b2756c5c16c93f96b571f6b08c175701485afc0c32261cac8 DIST nltk-smultron-20200312.zip 166207 BLAKE2B d0c3e75dd108965e260d913e0c02137da8707a751a4f53a1f59ff6a26af9997e2f0193980b5087277b87b74d9aab3e3c8d066d7264311cec2a94f0f9d21caf44 SHA512 8872717994269caff4ccf3f52cef273b2ef007efeb8ab89392cb4db3b8e3ba4b8859ab2fd70762b8941bee4558aebba3021869772be17a64e1056968aa605e19 DIST nltk-snowball_data-20200312.zip 6785405 BLAKE2B 44c10439b142540ac7eece967efa1431fd8f45342f0a90875dacf29ad374fe4c7d30af11d42ba45e0f1ec1836d56b2ff684ee352c5e8536cfb5db5eb7632285b SHA512 6c8a9259d88f6f7f499867d83b731de99d7fa4e8827ecedf836f653fc1a810efa9f6c5c6e2720a9e6610bc00978956b6a119bd08b70e3e241c4e9faccddd81d8 DIST nltk-spanish_grammars-20200312.zip 4047 BLAKE2B d8a8dbb558850a6a60f1fe5ab0f617f3a0f3c64bc7d49980cf793d374c6679d1bd42afb7e61776737b5eec162f2520abf2ee3acc92ea9ee0f397c3089b3b5b28 SHA512 4513347156e9351c259c0e2448198d68354bbc95e0a54561c31a88f13f333ebcba3e294c820cb62036665f2904eb6a7137546cd580e361c0423c30a8aed950b2 @@ -102,9 +103,10 @@ DIST nltk-webtext-20200312.zip 646297 BLAKE2B ca072fc38c144b659c76c36c9161641c91 DIST nltk-wmt15_eval-20200312.zip 383096 BLAKE2B 119943db4240171077569b3302c678644c2c9547ba67bfd055751059e0a3ad3ab6a19e4eedf9108d313d46dcd36cf19e11d973981da8c70a01c4cb790a7bd739 SHA512 362395d4c77ebe92f4c19fa8c2000082dc7a2343acc19cccb596ca00db6c40c231b904d807f46e2691cc4c4a0c79d14873b2a1983a494f2ca1485d540d787ceb DIST nltk-word2vec_sample-20200312.zip 49396025 BLAKE2B 0512b9bb7121a528190079f578e82b4e8f8021bfc0062cfa5613d260f3eee17460aecfeaacb65d950e79d27a653c78633a88c3638ec16377e2dbc3006387ebaa SHA512 09c30a4ab8f9fb6a5b36974b5953260d01cb4f285827fb90a374d054ad775ac978602ab56c452f46f4f8601312e232fd739d2f54dafa44ab8b7b01831cf0d9d1 DIST nltk-wordnet-20200312.zip 10775600 BLAKE2B dac56a8fb1fa6882b1871c394ad2acb2d3be739c424570e27c89fb6983df5f896a8f359092ba82752ddfc0531d83563a219e85f80124202f29bda93181efe4dc SHA512 1923a8bcd56fa0b9a9de91f53070dce28c3a7efbab11d2ef55c87134b1bf30de0f40abab59c39eb15dce54aec9491d8a5a259de212ff4cb25cde0ad09317009a -DIST nltk-wordnet31-20211023.zip 11055271 BLAKE2B e41a1951af5a71c9506d1e948b860574c94ab0ef31c1789a7e7bfb29c6dccea5b1d8895007631f7b595e9f90306365b5042e7a80dc6e1364fdbf4a5f0cba3b28 SHA512 a86091bd55e3a706892550b232be8f5199092623f1f8305d8c9be967a8527fe7d4ecb6250c369b229fdf52b6f3008106b758adc355fa2ad08b5b0cf2a458c173 +DIST nltk-wordnet2021-20211221.zip 11332750 BLAKE2B f4296728b803ef5a48cf312e2dd1471e8a240a0486e0c45b13c4c9998b8577ac30f87d151ffb2957e319e2b7b82706eba82b7b21530bfb3600d7cff2f4aefa0b SHA512 f27e847e0fd3fda83848af8bf94793d0e89c5f89b7868e222ba60be59122587f524e124e9d84f0b904b3a48808aae1c7d849f5851e2f3119ec99b3c30baac400 +DIST nltk-wordnet31-20211221.zip 11058667 BLAKE2B fb2dca62f602e05226dd62802f42e5f74d11cc535e71b1dc634a202a8c52a7912b1eb0b71edb3c564520c7c05251d9a9928abe88398b816a43035a6db4bda1b3 SHA512 dffea47ded52a042594dced94bc877686d78e6dc9d5a9db5f2a1718190f411cc991b04028d0a4407a8354afd695258e2a5ae881d5f6d9eaf0c3d30de1a8cbf96 DIST nltk-wordnet_ic-20200312.zip 12056682 BLAKE2B c2dc2a646015b23699a72f636b588ec5718c70e6941d9d56863257e1e0396c8cf59ac1dc6ed74e5d7f0c2ee9129d63221a03967bf66a3d335e99160f295ed44d SHA512 1c94451a13af6c76bff60a0cab2e70402a3d9abd2e8fc62a5473f24ab4229feb0afe4faa8d389734697a6cf86d2c8b1dc700bb3afa3cbc279b75d7e0ec19fc6d DIST nltk-words-20200312.zip 757777 BLAKE2B eaaaaab6c26e206e9b6ce45daf779e3cc6706a06132afeabf013026d0009caee2d678f3c4ea9125b9654f7143bef29ec7a5706b79e5650ea556c6821b7754e6f SHA512 2810f05d3fc7ee6b6f8636fa1ff7b4e8c8cdac12b415cc54d15c69102290122ea138ec4fa36cb483f790c1ac10b0f83ae4c2c3e0e8df7e67e90e962ee5dbb0be DIST nltk-ycoe-20200312.zip 477 BLAKE2B 574835aa011a06a06363e26facd6a6f583a1dc1cac2de39adff59d8ab48eefac030b43d935a2f79af855259f2a9a571193dae2811589483af97406ff05c76c9e SHA512 e39ce165074d10ff63cb84ea52905d7ecb937797c8123ed113c5609afe1f63ac44d04d48a681002c4eac21dc9076ac74164b886c6f9ce42f3a102c38d1e8e756 -EBUILD nltk-data-20211023.ebuild 4181 BLAKE2B 82e9f7818fc387f69b582ed62db6fa0679b25471ec579b442022f392222d15aa1bd7e9d9b4135cd50a9288c3116e2f26244d25ae3d22e635c66f32b6b259b606 SHA512 43b5bc6eca64fe86cfd3ca49cbd7db7a0421d061847fc4615c8767bd6ff1ee4939f8608370ad8fd8fa914f38ce68041ea2be1f133234ec9bd6cbfa1ee1fcc525 +EBUILD nltk-data-20211221.ebuild 4352 BLAKE2B 9789b9fe876cde0f6f651973190143c58a548ef08c05b0ee8166b1abaebdfd7a824bf14daf53c33fed9d280cc2b56542e350fabd49ab8c284f4c8ebb62065335 SHA512 34ab3931bd40c687a317bae2b98b0f53418694140b5b06a70e6b1ec7bc92f2c10ae47d15ad96a722ef8cccbaf903f930df50b358c44e8cafc64dc8f845312ee4 MISC metadata.xml 390 BLAKE2B e8f39395d8770de8e0e4c13fd51641c4f4a33935dc9e266a899d50cf9f42780e7682177a81f8902b20255114696d790e1c8aae8fdacd25afe8e6057d68d1b554 SHA512 6f173c8a058d6ae48c8316e00bfcd94e7f297667f729ebdab16733e6ae60ca43918e3e5f992faec07ab53c9682293f4543dbaf06045dfd6ff76cbedc8271afdd diff --git a/dev-python/nltk-data/nltk-data-20211023.ebuild b/dev-python/nltk-data/nltk-data-20211023.ebuild deleted file mode 100644 index df8437c785be..000000000000 --- a/dev-python/nltk-data/nltk-data-20211023.ebuild +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2020-2021 Gentoo Authors -# Distributed under the terms of the GNU General Public License v2 - -EAPI=7 - -inherit check-reqs - -DESCRIPTION="Data files for NLTK" -HOMEPAGE="https://www.nltk.org/nltk_data/" - -# at least some of the files have poorly documented licenses -# TODO: create a USE flag for free-ish subset -LICENSE="all-rights-reserved" -SLOT="0" -KEYWORDS="amd64 x86" -IUSE="extra" -RESTRICT="bindist mirror" - -BDEPEND="app-arch/unzip" - -PACKAGES_ZIP_2020=( - # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort - corpora/comtrans - corpora/conll2007 - corpora/jeita - corpora/knbc - corpora/machado - corpora/masc_tagged - corpora/nombank.1.0 - corpora/panlex_swadesh - corpora/propbank - corpora/reuters - corpora/semcor - corpora/universal_treebanks_v20 - sentiment/vader_lexicon - stemmers/snowball_data -) - -PACKAGES_UNPACK_2020=( - # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort - corpora/abc - corpora/alpino - corpora/brown - corpora/cess_cat - corpora/cess_esp - corpora/chat80 - corpora/city_database - corpora/cmudict - corpora/comparative_sentences - corpora/conll2000 - corpora/conll2002 - corpora/crubadan - corpora/dependency_treebank - corpora/dolch - corpora/europarl_raw - corpora/floresta - corpora/framenet_v15 - corpora/framenet_v17 - corpora/gazetteers - corpora/genesis - corpora/gutenberg - corpora/ieer - corpora/inaugural - corpora/indian - corpora/lin_thesaurus - corpora/mac_morpho - corpora/movie_reviews - corpora/mte_teip5 - corpora/names - corpora/nonbreaking_prefixes - corpora/nps_chat - corpora/omw - corpora/opinion_lexicon - corpora/pl196x - corpora/ppattach - corpora/product_reviews_1 - corpora/product_reviews_2 - corpora/pros_cons - corpora/ptb - corpora/qc - corpora/rte - corpora/senseval - corpora/sentence_polarity - corpora/sentiwordnet - corpora/shakespeare - corpora/sinica_treebank - corpora/state_union - corpora/subjectivity - corpora/swadesh - corpora/switchboard - corpora/timit - corpora/toolbox - corpora/treebank - corpora/twitter_samples - corpora/udhr - corpora/udhr2 - corpora/verbnet - corpora/webtext - corpora/wordnet - corpora/wordnet_ic - corpora/words - grammars/book_grammars - grammars/large_grammars - grammars/sample_grammars - misc/perluniprops - models/bllip_wsj_no_aux - models/moses_sample - models/wmt15_eval - models/word2vec_sample - stemmers/porter_test - stemmers/rslp - taggers/averaged_perceptron_tagger - taggers/averaged_perceptron_tagger_ru - taggers/universal_tagset - tokenizers/punkt -) - -PACKAGES_UNPACK_2021=( - corpora/stopwords - corpora/wordnet31 -) - -PACKAGES_UNPACK_EXTRA_2020=( - chunkers/maxent_ne_chunker - corpora/biocreative_ppi - corpora/brown_tei - corpora/kimmo - corpora/paradigms - corpora/pe08 - corpora/pil - corpora/problem_reports - corpora/smultron - corpora/unicode_samples - corpora/verbnet3 - corpora/ycoe - grammars/basque_grammars - grammars/spanish_grammars - help/tagsets - misc/mwa_ppdb - taggers/maxent_treebank_pos_tagger -) - -add_data() { - local x version=${1} - shift - - for x; do - SRC_URI+=" - https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${x}.zip - -> nltk-${x#*/}-${version}.zip" - done -} - -add_data 20200312 "${PACKAGES_ZIP_2020[@]}" "${PACKAGES_UNPACK_2020[@]}" -add_data 20211023 "${PACKAGES_UNPACK_2021[@]}" -SRC_URI+=" - extra? (" -add_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}" -SRC_URI+=" - )" - -CHECKREQS_DISK_USR=3G -CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR} - -unpack_data() { - local x version=${1} - shift - - for x; do - local cat=${x%/*} - local pkg=${x#*/} - - mkdir -p "${S}/${cat}" || die - cd "${S}/${cat}" || die - unpack "nltk-${pkg}-${version}.zip" - done -} - -src_unpack() { - unpack_data 20200312 "${PACKAGES_UNPACK_2020[@]}" - unpack_data 20211023 "${PACKAGES_UNPACK_2021[@]}" - use extra && unpack_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}" -} - -install_zips() { - local x version=${1} - shift - - for x; do - local cat=${x%/*} - local pkg=${x#*/} - - insinto "/usr/share/nltk_data/${cat}" - newins "${DISTDIR}/nltk-${pkg}-${version}.zip" "${pkg}.zip" - done -} - -src_install() { - dodir /usr/share/nltk_data - mv * "${ED}/usr/share/nltk_data/" || die - - install_zips 20200312 "${PACKAGES_ZIP_2020[@]}" -} diff --git a/dev-python/nltk-data/nltk-data-20211221.ebuild b/dev-python/nltk-data/nltk-data-20211221.ebuild new file mode 100644 index 000000000000..104917b4e5a0 --- /dev/null +++ b/dev-python/nltk-data/nltk-data-20211221.ebuild @@ -0,0 +1,210 @@ +# Copyright 2020-2021 Gentoo Authors +# Distributed under the terms of the GNU General Public License v2 + +EAPI=8 + +inherit check-reqs + +DESCRIPTION="Data files for NLTK" +HOMEPAGE="https://www.nltk.org/nltk_data/" + +# at least some of the files have poorly documented licenses +# TODO: create a USE flag for free-ish subset +LICENSE="all-rights-reserved" +SLOT="0" +KEYWORDS="amd64 x86" +IUSE="extra" +RESTRICT="bindist mirror" + +BDEPEND="app-arch/unzip" + +PACKAGES_ZIP_2020=( + # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort + corpora/comtrans + corpora/conll2007 + corpora/jeita + corpora/knbc + corpora/machado + corpora/masc_tagged + corpora/nombank.1.0 + corpora/panlex_swadesh + corpora/propbank + corpora/reuters + corpora/semcor + corpora/universal_treebanks_v20 + sentiment/vader_lexicon + stemmers/snowball_data +) + +PACKAGES_UNPACK_2020=( + # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort + corpora/abc + corpora/alpino + corpora/brown + corpora/cess_cat + corpora/cess_esp + corpora/chat80 + corpora/city_database + corpora/cmudict + corpora/comparative_sentences + corpora/conll2000 + corpora/conll2002 + corpora/crubadan + corpora/dependency_treebank + corpora/dolch + corpora/europarl_raw + corpora/floresta + corpora/framenet_v15 + corpora/framenet_v17 + corpora/gazetteers + corpora/genesis + corpora/gutenberg + corpora/ieer + corpora/indian + corpora/lin_thesaurus + corpora/mac_morpho + corpora/movie_reviews + corpora/mte_teip5 + corpora/names + corpora/nonbreaking_prefixes + corpora/nps_chat + corpora/omw + corpora/opinion_lexicon + corpora/pl196x + corpora/ppattach + corpora/product_reviews_1 + corpora/product_reviews_2 + corpora/pros_cons + corpora/ptb + corpora/qc + corpora/rte + corpora/senseval + corpora/sentence_polarity + corpora/sentiwordnet + corpora/shakespeare + corpora/state_union + corpora/subjectivity + corpora/swadesh + corpora/switchboard + corpora/timit + corpora/toolbox + corpora/treebank + corpora/twitter_samples + corpora/udhr + corpora/udhr2 + corpora/verbnet + corpora/webtext + corpora/wordnet + corpora/wordnet_ic + corpora/words + grammars/book_grammars + grammars/large_grammars + grammars/sample_grammars + misc/perluniprops + models/bllip_wsj_no_aux + models/moses_sample + models/wmt15_eval + models/word2vec_sample + stemmers/porter_test + stemmers/rslp + taggers/averaged_perceptron_tagger + taggers/averaged_perceptron_tagger_ru + taggers/universal_tagset + tokenizers/punkt +) + +PACKAGES_UNPACK_2021=( + corpora/stopwords +) + +PACKAGES_UNPACK_2021_12=( + corpora/inaugural + corpora/omw-1.4 + corpora/sinica_treebank + corpora/wordnet2021 + corpora/wordnet31 +) + +PACKAGES_UNPACK_EXTRA_2020=( + chunkers/maxent_ne_chunker + corpora/biocreative_ppi + corpora/brown_tei + corpora/kimmo + corpora/paradigms + corpora/pe08 + corpora/pil + corpora/problem_reports + corpora/smultron + corpora/unicode_samples + corpora/verbnet3 + corpora/ycoe + grammars/basque_grammars + grammars/spanish_grammars + help/tagsets + misc/mwa_ppdb + taggers/maxent_treebank_pos_tagger +) + +add_data() { + local x version=${1} + shift + + for x; do + SRC_URI+=" + https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${x}.zip + -> nltk-${x#*/}-${version}.zip" + done +} + +add_data 20200312 "${PACKAGES_ZIP_2020[@]}" "${PACKAGES_UNPACK_2020[@]}" +add_data 20211023 "${PACKAGES_UNPACK_2021[@]}" +add_data 20211221 "${PACKAGES_UNPACK_2021_12[@]}" +SRC_URI+=" + extra? (" +add_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}" +SRC_URI+=" + )" + +CHECKREQS_DISK_USR=3G +CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR} + +unpack_data() { + local x version=${1} + shift + + for x; do + local cat=${x%/*} + local pkg=${x#*/} + + mkdir -p "${S}/${cat}" || die + cd "${S}/${cat}" || die + unpack "nltk-${pkg}-${version}.zip" + done +} + +src_unpack() { + unpack_data 20200312 "${PACKAGES_UNPACK_2020[@]}" + unpack_data 20211023 "${PACKAGES_UNPACK_2021[@]}" + unpack_data 20211221 "${PACKAGES_UNPACK_2021_12[@]}" + use extra && unpack_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}" +} + +install_zips() { + local x version=${1} + shift + + for x; do + local cat=${x%/*} + local pkg=${x#*/} + + insinto "/usr/share/nltk_data/${cat}" + newins "${DISTDIR}/nltk-${pkg}-${version}.zip" "${pkg}.zip" + done +} + +src_install() { + dodir /usr/share/nltk_data + mv * "${ED}/usr/share/nltk_data/" || die + + install_zips 20200312 "${PACKAGES_ZIP_2020[@]}" +} -- cgit v1.2.3