#!/bin/bash

# Prints the help text for quickbin.sh to stdout.
# Takes no arguments.  The text is a single double-quoted literal that begins
# and ends with a newline, so the output starts and ends with a blank line.
usage(){
echo "
Written by Brian Bushnell
Last modified February 22, 2025

Description:  Bins contigs using coverage and kmer frequencies.
If reads or covstats are provided, coverage will be calculated from those;
otherwise, it will be parsed from contig headers.  Coverage can be parsed
from Spades or Tadpole contig headers; alternatively, renamebymapping.sh
can be used to annotate the headers with coverage from multiple sam files.
Any number of sam files may be used (from different samples of the same
environment, usually).  The more sam files, the more accurate, though
some stringency (depthratio and maxcovariance) may need to be relaxed
with large numbers of sam files (more than 4).

Usage:  quickbin.sh in=contigs.fa out=bins/bin_%.fa
or
quickbin.sh in=contigs.fa out=bin_%.fa *.sam covout=cov.txt
or
quickbin.sh in=contigs.fa out=bin_%.fa cov=covfile.txt

File parameters:
in=<file>       Assembly input; only required parameter.
reads=<file>    Read input (fastq or sam).  Multiple sam files may be used,
                comma-delimited, or as arguments without 'readsin='.
cov=<file>      Cov file generated by Quickbin from sam files; can be used
                instead of sam files.
out=<pattern>   Output pattern.  If this contains a % symbol, like bin%.fa,
                one file will be created per bin.  If not, all contigs will
                be written to the same file, with the name modified to
                indicate their bin number.

Size parameters:
mincluster=50k  Minimum output cluster size in base pairs; smaller clusters
                will share a residual file.
mincontig=100   Don't load contigs smaller than this; reduces memory usage.
minseed=2000    Minimum initial cluster size; increasing this will increase
                speed dramatically for large metagenomes, reduce sensitivity
                for small contigs, and slightly reduce contamination.
                In particular, large metagenomes with only 1 sample will run
                slowly if this is below 2000; with at least 3 samples the
                speed should not be affected much.
minresidue=200  Discard unclustered contigs shorter than this; reduces memory.

Quantization parameters:
gcwidth=0.02    Width of GC matrix gridlines.  Smaller is faster.
depthwidth=0.5  Width of depth matrix gridlines.  Smaller is faster.  This
                is on a log2 scale so 0.5 would mean 2 gridlines per power
                of 2 depth; e.g. lines at 0.707, 1, 1.414, 2, 2.828, etc.
Note: Halving both quantization parameters can roughly quadruple speed,
but will decrease recovery of shorter contigs.

Edge-processing parameters:
e1=0                  Edge-first clustering passes; may increase speed
                      at the cost of purity.
e2=4                  Later edge-based clustering passes.
edgeStringency1=0.25  Stringency for edge-first clustering; 
                      lower is more stringent.
edgeStringency2=2     Stringency for later edge-based clustering.
maxEdges=4            Follow up to this many edges per contig.
minEdgeWeight=2       Ignore edges made from fewer read pairs.
minEdgeRatio=0.4      Ignore edges under this fraction of max edge weight.
minmapq=20            Ignore reads mapping with mapq below this for the
                      purpose of making edges.  They are still used for depth.
goodedgemult=1.4      Merge stringency multiplier for contigs joined by
                      an edge; lower is more stringent.

Merge stringency thresholds:
maxKmerDif2=0.005     Cluster refinement tetramer frequency cosine difference.
maxDepthRatio2=1.25   Cluster refinement relative depth ratio.
maxGCDif2=0.03        Cluster refinement GC content difference.
maxCovariance2=0.0002 Cluster refinement depth covariance cosine difference.
residuestringency=0.6 Increased stringency for contigs under minseed.
                      Lower than 1 is more strict. 

smallThresh=10000     Clusters smaller than this are considered small.
smallMult=2.2         Stringency multiplier for small clusters; higher is less
                      stringent.

Note that the above default thresholds are for a single sample 
(one sam file) and are automatically adjusted if there are more samples,
but will still be overridden if specified on the command line.  See
Binner.setSamples() in bbmap/current/bin/Binner.java for details.

Other parameters:
sketchoutput=f        Use SendSketch to identify taxonomy of output clusters.
validate=f            If contig headers have a term such as 'tid_1234', this
                      will be parsed and used to evaluate correctness.

Java Parameters:
-Xmx            This will set Java's memory usage, overriding autodetection.
                -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will
                specify 200 megs. The max is typically 85% of physical memory.
-eoom           This flag will cause the process to exit if an out-of-memory
                exception occurs.  Requires Java 8u92+.
-da             Disable assertions.

Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}

# Locate the directory that really holds this script, following any chain of
# symlinks, so the classpath is correct even when the script is run through a
# symlink.  pushd/popd bracket the walk so the caller's working directory is
# restored afterwards.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [[ -L "$DIR" ]]; do
  # Step into the link's own directory first so that a relative link target
  # is resolved from the right place.
  cd "$(dirname "$DIR")"
  DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="${PWD}/"
popd > /dev/null

# Simpler variant that does not follow symlinks, kept for reference:
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
# Classpath root: the "current/" directory next to this script.
CP="${DIR}current/"

# Fallback JVM heap flags (4 GB); calcXmx overwrites both unless $set ends up
# as 1.  NOTE(review): $set is presumably flipped by parseXmx in calcmem.sh
# when the user supplies an explicit memory flag -- confirm there.
z2="-Xms4g"
z="-Xmx4g"
set=0

# With no arguments at all, or an explicit -h / --help as the first argument,
# print the help text and stop.
case "$1" in
	''|-h|--help)
		usage
		exit
		;;
esac

# Chooses the JVM heap flags.
# Arguments: the script's command-line arguments, forwarded to parseXmx.
# Globals:   reads DIR, set and RAM; writes z and z2.
# Loads calcmem.sh from $DIR, which supplies setEnvironment, parseXmx and
# freeRam.  When $set is 1 after parseXmx has run, z and z2 are left as they
# are; otherwise both are rebuilt from $RAM after calling freeRam.
calcXmx () {
	source "${DIR}/calcmem.sh"
	setEnvironment
	parseXmx "$@"
	if [[ $set != 1 ]]; then
		# NOTE(review): 4000m / 84 look like a fallback size and a percentage
		# of available memory -- confirm against freeRam in calcmem.sh.
		freeRam 4000m 84
		z="-Xmx${RAM}m"
		z2="-Xms${RAM}m"
	fi
}
calcXmx "$@"

# Runs the bin.QuickBin Java class with the classpath in $CP.
# Arguments: the script's command-line arguments, passed to Java verbatim.
# Globals:   reads EA, EOOM, z and CP.
#            NOTE(review): EA and EOOM are not set in this script; presumably
#            they come from calcmem.sh -- confirm there.
# Outputs:   writes the command line to stderr before running it.
# Returns:   the exit status of the java process.
#
# The command is built as an argv array and executed directly, not flattened
# into a string and passed through eval.  This keeps each argument as one word,
# so an install path or file name containing spaces is no longer split, and
# shell metacharacters in arguments are not interpreted a second time.
quickbin() {
	# $EA, $EOOM and $z are left unquoted on purpose: each may be empty, in
	# which case it must vanish and not become an empty argument.
	local -a cmd=(java $EA $EOOM $z -cp "$CP" bin.QuickBin "$@")
	printf '%s\n' "${cmd[*]}" >&2
	"${cmd[@]}"
}

quickbin "$@"
