diff --git a/tsv/tsvsummary b/tsv/tsvsummary index 1f1e064..da6577c 100755 --- a/tsv/tsvsummary +++ b/tsv/tsvsummary @@ -5,21 +5,25 @@ function usage { echo -e ' tsvsummary, a CSVSummary clone for the command line. Depends on csvkit and ncsv -Usage: tsvsummary [-c column] [-t type] tsv_file - -c name of the column to use as clusterCol - (produce as many lines as column has unique values) - -t Type of statistics: mean, max, min, sum, stdev, nas, unique - -h Help +Usage: tsvsummary [-c column] [ -n name ] [-t type] tsv_file + -c name of the column to use as clusterCol + (produce as many lines as column has unique values) + -n Name of column to print an element count + -t Type of statistics: mean, max, min, sum, stdev, nas, unique + -h Help ' } TYPEOF=mean TYPELIST=( mean max min sum stdev nas unique ) -while getopts c:ht: opt +while getopts c:hn:t: opt do case "$opt" in c) LABEL=$OPTARG ;; + n) + NAMECOUNT=$OPTARG + ;; t) TYPEOF=$( echo $OPTARG | tr '[:upper:]' '[:lower:]' ) ;; @@ -34,31 +38,42 @@ do case "$opt" in done shift $(( ${OPTIND} - 1 )); -for type in "${TYPELIST[@]}" -do if [ "$type" = "$TYPEOF" ]; then TYPEFOUND=1;fi +for type in "${TYPELIST[@]}"; do + if [ "$type" = "$TYPEOF" ]; then TYPEFOUND=1;fi done [[ -z "$TYPEFOUND" ]] && ( echo Statistics type $TYPEOF not recognized, valid ones: ${TYPELIST[@]}; exit 1 ) -if [ -z "$1" ] -then usage - echo No file name provided - exit 1 +if [ -z "$1" ]; then + usage + echo No file name provided + exit 1 fi -which ncsv &> /dev/null || ( echo ncsv required: "https://bitbucket.org/MoonQ/ncsv"; exit 1 ) -which csvcut &> /dev/null || ( echo csvkit required: "https://csvkit.readthedocs.org/"; exit 1 ) +which ncsv &> /dev/null || ( echo ncsv required: "https://bitbucket.org/MoonQ/ncsv (install with pip)"; exit 1 ) +which csvcut &> /dev/null || ( echo csvkit required: "https://csvkit.readthedocs.org/ (install with pip)"; exit 1 ) -if [ -z "$LABEL" ] -then ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip" - exit +if [ -z "$LABEL" ]; then + if [ -n "$NAMECOUNT" ]; then + paste <( ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip" ) <( echo "$NAMECOUNT"; cat "$1" | wc -l ) + else + ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip" + fi + exit else - IFS=$'\n' - UNIQUE=( $( csvcut -t -c "$LABEL" "$1" | tail -n +2 | sort -uV ) ) - HEADER=$( csvcut -t -C "$LABEL" "$1" | head -n 1 | ncsv -i, -d"\t" ) - echo -e "$LABEL\tStatistic\t$HEADER" - - for (( i=0; i<${#UNIQUE[@]}; i++ )) - do echo -ne "${UNIQUE[$i]}\t" - csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | ncsv -i, --stat | sed -n "/^$TYPEOF/Ip" - done + IFS=$'\n' + UNIQUE=( $( csvcut -t -c "$LABEL" "$1" | tail -n +2 | sort -uV ) ) + HEADER=$( csvcut -t -C "$LABEL" "$1" | head -n 1 | ncsv -i, -d"\t" ) + if [ -n "$NAMECOUNT" ]; then + HEADER+="\t$NAMECOUNT" + fi + echo -e "$LABEL\tStatistic\t$HEADER" + + for (( i=0; i<${#UNIQUE[@]}; i++ )); do + echo -ne "${UNIQUE[$i]}\t" + csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | ncsv -i, --stat | sed -n "/^$TYPEOF/Ip" | tr -d \\n + if [ -n "$NAMECOUNT" ]; then + printf "\t%s" $( csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | wc -l ) + fi + printf "\n" + done fi