added column counts to tsvsummary

This commit is contained in:
Ville Rantanen
2017-04-12 13:37:55 +03:00
parent 5e684b48c5
commit b43bc796fd

View File

@@ -5,21 +5,25 @@ function usage {
echo -e ' tsvsummary, a CSVSummary clone for the command line. echo -e ' tsvsummary, a CSVSummary clone for the command line.
Depends on csvkit and ncsv Depends on csvkit and ncsv
Usage: tsvsummary [-c column] [-t type] tsv_file Usage: tsvsummary [-c column] [ -n name ] [-t type] tsv_file
-c name of the column to use as clusterCol -c name of the column to use as clusterCol
(produce as many lines as column has unique values) (produce as many lines as column has unique values)
-t Type of statistics: mean, max, min, sum, stdev, nas, unique -n Name of column to print an element count
-h Help -t Type of statistics: mean, max, min, sum, stdev, nas, unique
-h Help
' '
} }
TYPEOF=mean TYPEOF=mean
TYPELIST=( mean max min sum stdev nas unique ) TYPELIST=( mean max min sum stdev nas unique )
while getopts c:ht: opt while getopts c:hn:t: opt
do case "$opt" in do case "$opt" in
c) c)
LABEL=$OPTARG LABEL=$OPTARG
;; ;;
n)
NAMECOUNT=$OPTARG
;;
t) t)
TYPEOF=$( echo $OPTARG | tr '[:upper:]' '[:lower:]' ) TYPEOF=$( echo $OPTARG | tr '[:upper:]' '[:lower:]' )
;; ;;
@@ -34,31 +38,42 @@ do case "$opt" in
done done
shift $(( ${OPTIND} - 1 )); shift $(( ${OPTIND} - 1 ));
for type in "${TYPELIST[@]}" for type in "${TYPELIST[@]}"; do
do if [ "$type" = "$TYPEOF" ]; then TYPEFOUND=1;fi if [ "$type" = "$TYPEOF" ]; then TYPEFOUND=1;fi
done done
[[ -z "$TYPEFOUND" ]] && ( echo Statistics type $TYPEOF not recognized, valid ones: ${TYPELIST[@]}; exit 1 ) [[ -z "$TYPEFOUND" ]] && ( echo Statistics type $TYPEOF not recognized, valid ones: ${TYPELIST[@]}; exit 1 )
if [ -z "$1" ] if [ -z "$1" ]; then
then usage usage
echo No file name provided echo No file name provided
exit 1 exit 1
fi fi
which ncsv &> /dev/null || ( echo ncsv required: "https://bitbucket.org/MoonQ/ncsv"; exit 1 ) which ncsv &> /dev/null || ( echo ncsv required: "https://bitbucket.org/MoonQ/ncsv (install with pip)"; exit 1 )
which csvcut &> /dev/null || ( echo csvkit required: "https://csvkit.readthedocs.org/"; exit 1 ) which csvcut &> /dev/null || ( echo csvkit required: "https://csvkit.readthedocs.org/ (install with pip)"; exit 1 )
if [ -z "$LABEL" ] if [ -z "$LABEL" ]; then
then ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip" if [ -n "$NAMECOUNT" ]; then
exit paste <( ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip" ) <( echo "$NAMECOUNT"; cat "$1" | wc -l )
else
ncsv --stat "$1" | sed -n "1p;/^$TYPEOF/Ip"
fi
exit
else else
IFS=$'\n' IFS=$'\n'
UNIQUE=( $( csvcut -t -c "$LABEL" "$1" | tail -n +2 | sort -uV ) ) UNIQUE=( $( csvcut -t -c "$LABEL" "$1" | tail -n +2 | sort -uV ) )
HEADER=$( csvcut -t -C "$LABEL" "$1" | head -n 1 | ncsv -i, -d"\t" ) HEADER=$( csvcut -t -C "$LABEL" "$1" | head -n 1 | ncsv -i, -d"\t" )
echo -e "$LABEL\tStatistic\t$HEADER" if [ -n "$NAMECOUNT" ]; then
HEADER+="\t$NAMECOUNT"
for (( i=0; i<${#UNIQUE[@]}; i++ )) fi
do echo -ne "${UNIQUE[$i]}\t" echo -e "$LABEL\tStatistic\t$HEADER"
csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | ncsv -i, --stat | sed -n "/^$TYPEOF/Ip"
done for (( i=0; i<${#UNIQUE[@]}; i++ )); do
echo -ne "${UNIQUE[$i]}\t"
csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | ncsv -i, --stat | sed -n "/^$TYPEOF/Ip" | tr -d \\n
if [ -n "$NAMECOUNT" ]; then
printf "\t%s" $( csvgrep -t -c "$LABEL" -m "${UNIQUE[$i]}" "$1" | csvcut -C "$LABEL" | wc -l )
fi
printf "\n"
done
fi fi