Added an optional element-count column (-n) to tsvsummary

2017-04-12 13:37:55 +03:00
parent 5e684b48c5
commit b43bc796fd
1 changed file with 41 additions and 26 deletions
--- a/tsv/tsvsummary
+++ b/tsv/tsvsummary
@@ -5,21 +5,25 @@ function usage {
 echo -e ' tsvsummary, a CSVSummary clone for the command line. 
 Depends on csvkit and ncsv
-Usage:   tsvsummary [-c column] [-t type] tsv_file
+Usage:  tsvsummary [-c column] [ -n name ] [-t type] tsv_file
-   -c    name of the column to use as clusterCol 
+   -c   name of the column to use as clusterCol 
-         (produce as many lines as column has unique values)
+        (produce as many lines as column has unique values)
-   -t    Type of statistics: mean, max, min, sum, stdev, nas, unique
+   -n   Name of column to print an element count
-   -h    Help
+   -t   Type of statistics: mean, max, min, sum, stdev, nas, unique
   -h   Help
 '
 }
 TYPEOF=mean
 TYPELIST=( mean max min sum stdev nas unique )
-while getopts c:ht: opt
+while getopts c:hn:t: opt
 do  case "$opt" in
    c)
        LABEL=$OPTARG
    ;;
    n)
        NAMECOUNT=$OPTARG
    ;;
    t)
        TYPEOF=$( echo $OPTARG | tr '[:upper:]' '[:lower:]' )
    ;;
@@ -34,31 +38,42 @@ do  case "$opt" in
 done
 shift $(( ${OPTIND} - 1 ));
-for type in "${TYPELIST[@]}"
+for type in "${TYPELIST[@]}"; do 
-do if [ "$type" = "$TYPEOF" ]; then TYPEFOUND=1;fi
+   if [ "$type" = "$TYPEOF" ]; then TYPEFOUND=1;fi
 done
 [[ -z "$TYPEFOUND" ]] && ( echo Statistics type $TYPEOF not recognized, valid ones: ${TYPELIST[@]}; exit 1 )
-if [ -z "$1" ]
+if [ -z "$1" ]; then
-then usage
+    usage
-     echo No file name provided
+    echo No file name provided
-     exit 1
+    exit 1
 fi
-which ncsv &> /dev/null || ( echo ncsv required: "https://bitbucket.org/MoonQ/ncsv"; exit 1 )
+which ncsv &> /dev/null || ( echo ncsv required: "https://bitbucket.org/MoonQ/ncsv (install with pip)"; exit 1 )
-which csvcut &> /dev/null || ( echo csvkit required: "https://csvkit.readthedocs.org/"; exit 1 )
+which csvcut &> /dev/null || ( echo csvkit required: "https://csvkit.readthedocs.org/ (install with pip)"; exit 1 )
-if [ -z "$LABEL" ]
+if [ -z "$LABEL" ]; then
-then ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip"
+    if [ -n "$NAMECOUNT" ]; then 
-     exit
+        paste <( ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip" ) <( echo "$NAMECOUNT"; cat "$1" | wc -l )
    else
        ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip"
    fi
    exit
 else
-     IFS=$'\n'
+    IFS=$'\n'
-     UNIQUE=( $( csvcut -t -c "$LABEL" "$1" | tail -n +2 | sort -uV ) )
+    UNIQUE=( $( csvcut -t -c "$LABEL" "$1" | tail -n +2 | sort -uV ) )
-     HEADER=$( csvcut -t -C "$LABEL" "$1" | head -n 1 | ncsv -i, -d"\t" )
+    HEADER=$( csvcut -t -C "$LABEL" "$1" | head -n 1 | ncsv -i, -d"\t" )
-     echo -e "$LABEL\tStatistic\t$HEADER"
+    if [ -n "$NAMECOUNT" ]; then 
-     
+        HEADER+="\t$NAMECOUNT"
-     for (( i=0; i<${#UNIQUE[@]}; i++ ))
+    fi
-     do  echo -ne "${UNIQUE[$i]}\t"
+    echo -e "$LABEL\tStatistic\t$HEADER"
-         csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | ncsv -i, --stat | sed -n "/^$TYPEOF/Ip"
+    
-     done
+    for (( i=0; i<${#UNIQUE[@]}; i++ )); do
        echo -ne "${UNIQUE[$i]}\t"
        csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | ncsv -i, --stat | sed -n "/^$TYPEOF/Ip" | tr -d \\n
        if [ -n "$NAMECOUNT" ]; then 
            printf "\t%s" $( csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | wc -l )
        fi
        printf "\n"
    done
 fi