@@ -0,0 +1,47 @@
+# Copyright 1999-2017 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+inherit flag-o-matic toolchain-funcs
+DESCRIPTION="Clustering Database at High Identity with Tolerance"
+SRC_URI="${PV}/${RELEASE}.tar.gz -> ${P}.tar.gz"
+KEYWORDS="~amd64 ~x86 ~amd64-linux ~x86-linux"
+ "${FILESDIR}"/${PN}-4.6.6-fix-perl-shebangs.patch
+ "${FILESDIR}"/${PN}-4.6.6-fix-build-system.patch
+pkg_pretend() {
+ [[ ${MERGE_TYPE} != binary ]] && use openmp && tc-check-openmp
+pkg_setup() {
+ [[ ${MERGE_TYPE} != binary ]] && use openmp && tc-check-openmp
+src_compile() {
+ tc-export CXX
+ emake openmp=$(usex openmp)
+src_install() {
+ dodir /usr/bin
+ PREFIX="${EPREFIX}"/usr/bin default
+ dodoc doc/*.pdf
diff --git a/sci-biology/cd-hit/files/cd-hit-4.6.6-fix-build-system.patch b/sci-biology/cd-hit/files/cd-hit-4.6.6-fix-build-system.patch
new file mode 100644
index 000000000000..c668d5c6154e
--- /dev/null
+++ b/sci-biology/cd-hit/files/cd-hit-4.6.6-fix-build-system.patch
@@ -0,0 +1,122 @@
+Fix the build system so that it honours user-supplied CXX, CXXFLAGS, CPPFLAGS, LDFLAGS and DESTDIR
+--- a/makefile
++++ b/makefile
+@@ -1,7 +1,4 @@
+-CC = g++ -Wall -ggdb
+-CC = g++ -pg
+-CC = g++
++CXX ?= g++
+ # without OpenMP
+@@ -9,35 +6,19 @@
+ # in command line:
+ # make openmp=yes
+ ifeq ($(openmp),no)
+- CCFLAGS = -fopenmp
+-# support debugging
+-# in command line:
+-# make debug=yes
+-# make openmp=yes debug=yes
+-ifeq ($(debug),yes)
+-CCFLAGS += -ggdb
+ else
+-CCFLAGS += -O2
++ my_CXXFLAGS = -fopenmp
+ endif
+ ifdef MAX_SEQ
+ endif
+-#LDFLAGS = -static -o
+-LDFLAGS += -o
+ PROGS = cd-hit cd-hit-est cd-hit-2d cd-hit-est-2d cd-hit-div cd-hit-454
+-# Propagate hardening flags
+ .c++.o:
+- $(CC) $(CCFLAGS) -c $<
+ all: $(PROGS)
+@@ -47,52 +28,52 @@
+ # programs
+ cd-hit: cdhit-common.o cdhit-utility.o cdhit.o
+- $(CC) $(CCFLAGS) cdhit.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) cdhit.o cdhit-common.o cdhit-utility.o -o cd-hit
+ cd-hit-2d: cdhit-common.o cdhit-utility.o cdhit-2d.o
+- $(CC) $(CCFLAGS) cdhit-2d.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-2d
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) cdhit-2d.o cdhit-common.o cdhit-utility.o -o cd-hit-2d
+ cd-hit-est: cdhit-common.o cdhit-utility.o cdhit-est.o
+- $(CC) $(CCFLAGS) cdhit-est.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-est
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) cdhit-est.o cdhit-common.o cdhit-utility.o -o cd-hit-est
+ cd-hit-est-2d: cdhit-common.o cdhit-utility.o cdhit-est-2d.o
+- $(CC) $(CCFLAGS) cdhit-est-2d.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-est-2d
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) cdhit-est-2d.o cdhit-common.o cdhit-utility.o -o cd-hit-est-2d
+ cd-hit-div: cdhit-common.o cdhit-utility.o cdhit-div.o
+- $(CC) $(CCFLAGS) cdhit-div.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-div
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) cdhit-div.o cdhit-common.o cdhit-utility.o -o cd-hit-div
+ cd-hit-454: cdhit-common.o cdhit-utility.o cdhit-454.o
+- $(CC) $(CCFLAGS) cdhit-454.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-454
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) cdhit-454.o cdhit-common.o cdhit-utility.o -o cd-hit-454
+ # objects
+ cdhit-common.o: cdhit-common.c++ cdhit-common.h
+- $(CC) $(CCFLAGS) cdhit-common.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit-common.c++ -c
+ cdhit-utility.o: cdhit-utility.c++ cdhit-utility.h
+- $(CC) $(CCFLAGS) cdhit-utility.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit-utility.c++ -c
+ cdhit.o: cdhit.c++ cdhit-utility.h
+- $(CC) $(CCFLAGS) cdhit.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit.c++ -c
+ cdhit-2d.o: cdhit-2d.c++ cdhit-utility.h
+- $(CC) $(CCFLAGS) cdhit-2d.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit-2d.c++ -c
+ cdhit-est.o: cdhit-est.c++ cdhit-utility.h
+- $(CC) $(CCFLAGS) cdhit-est.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit-est.c++ -c
+ cdhit-est-2d.o: cdhit-est-2d.c++ cdhit-utility.h
+- $(CC) $(CCFLAGS) cdhit-est-2d.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit-est-2d.c++ -c
+ cdhit-div.o: cdhit-div.c++ cdhit-common.h
+- $(CC) $(CCFLAGS) cdhit-div.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit-div.c++ -c
+ cdhit-454.o: cdhit-454.c++ cdhit-common.h
+- $(CC) $(CCFLAGS) cdhit-454.c++ -c
++ $(CXX) $(my_CXXFLAGS) $(CXXFLAGS) $(my_CPPFLAGS) $(CPPFLAGS) cdhit-454.c++ -c
+ PREFIX ?= /usr/local/bin
+ install:
+ for prog in $(PROGS); do \
+- install -m 0755 $$prog $(PREFIX); \
++ install -m 0755 $$prog $(DESTDIR)$(PREFIX); \
+ done
+- install -m 0755 *.pl $(PREFIX);
++ install -m 0755 *.pl $(DESTDIR)$(PREFIX);
diff --git a/sci-biology/cd-hit/files/cd-hit-4.6.6-fix-perl-shebangs.patch b/sci-biology/cd-hit/files/cd-hit-4.6.6-fix-perl-shebangs.patch
new file mode 100644
index 000000000000..3784296f2e94
--- /dev/null
+++ b/sci-biology/cd-hit/files/cd-hit-4.6.6-fix-perl-shebangs.patch
@@ -0,0 +1,219 @@
+Make perl shebangs more Prefix friendly
+See also:
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ # =============================================================================
+ # CD-HIT
+ #
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ #not like cd-hit-div, this script do not sort input
+ #or throw away seq
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ # =============================================================================
+ # CD-HIT
+ #
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ $clstr = shift;
+ $fr = shift; # for nr80.clstr $fr = 0.8
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ my $no = 0;
+ my $clstr_no = "";
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ #usage: [-len|-size] level1.clstr [level2.clstr level3.clstr ...]
+ #purpose: to create xml file from cd-hit or hierarchical cd-hit(h-cd-hit) results
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ #keep only top $no proteins in cluster
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ # order of clusters don't need to be the same
+ # but then I have to read everything into memory
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ # the order of clusters need to be identical
+ my ($master_clstr, @clstr) = @ARGV;
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ ## calculate the sensitivity and specificity of clusters
+ ## if the input fasta file has pre-defined classification term
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ ## calculate the sensitivity and specificity of clusters
+ ## if the input fasta file has pre-defined classification term
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ $file90 = shift;
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ $no = 0;
+ while($ll=<>){
+ if ($ll =~ /^>Cluster (\d+)/) {
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ $rep = "";
+ $no = 0;
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ # output single fasta file
+ # for each cluster output at least $cutoff seqs
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ # if nr90 from nr100 and
+ # nr80 from nr90, so I have nr90.clstr and nr80.clstr
+ # but, in nr80.clstr, some gi numbers whose from nr100 are there
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ #my $by = shift;
+ my $min;
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ #my $by = shift;
+ my $min;
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ if(@ARGV==0){
+ print "Usage:\n\ [-bin N] clstr_file\n";
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ if(@ARGV==0){
+ print "Usage:\n\ clstr_file\n";
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ my $sort_by_what = shift;
+ $sort_by_what = "no" unless $sort_by_what;
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ my $sort_by = shift;
+ $sort_by = "len" unless ($sort_by);
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ if(@ARGV==0){
+ print "Usage:\n\ clstr_file tbl_file\n";
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ if(@ARGV==0){
+ print "Usage:\n\ table_file level\n";
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ #note you have to use "-d 0" in the cd-hit run
+ #note you better to use "-g 1" in the cd-hit run
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ use Image::Magick;
+--- a/
++++ b/
+@@ -1,4 +1,4 @@
++#!/usr/bin/env perl
+ $file90 = shift;
+ $segs = shift;
diff --git a/sci-biology/cd-hit/metadata.xml b/sci-biology/cd-hit/metadata.xml
new file mode 100644
index 000000000000..1f3b075216dd
--- /dev/null
+++ b/sci-biology/cd-hit/metadata.xml
@@ -0,0 +1,27 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<!DOCTYPE pkgmetadata SYSTEM "http://www.gentoo.org/dtd/metadata.dtd">
+ <maintainer type="project">
+ <email></email>
+ <name>Gentoo Biology Project</name>
+ </maintainer>
+ <longdescription>
+CD-HIT is a very widely used program for clustering and comparing large sets
+of protein or nucleotide sequences. CD-HIT is very fast and can handle
+extremely large databases. CD-HIT helps to significantly reduce the
+computational and manual efforts in many sequence analysis tasks and aids in
+understanding the data structure and correcting the bias within a dataset.
+The CD-HIT package has CD-HIT, CD-HIT-2D, CD-HIT-EST, CD-HIT-EST-2D,
+CD-HIT-454, CD-HIT-PARA, PSI-CD-HIT and over a dozen scripts. CD-HIT
+(CD-HIT-EST) clusters similar proteins (DNAs) into clusters that meet a
+user-defined similarity threshold. CD-HIT-2D (CD-HIT-EST-2D) compares 2
+datasets and identifies the sequences in db2 that are similar to db1 above
+a threshold. CD-HIT-454 is a program to identify natural and artificial
+duplicates from pyrosequencing reads. The usage of other programs and
+scripts can be found in the CD-HIT user's guide.
+ </longdescription>
+ <upstream>
+ <remote-id type="google-code">cdhit</remote-id>
+ <remote-id type="github">weizhongli/cdhit</remote-id>
+ </upstream>