q-tools/tsv/tsvkit.sh

## Tab separated file functions, wrappers for csvkit
# To use the tsv- commands, install csvkit: <pre>sudo apt-get install python-pip; sudo pip install csvkit</pre>

# Directory this file was sourced from; helper scripts are addressed relative to it
TSVDIR="$(dirname "${BASH_SOURCE[0]}")"

function c2t {
# Read comma separated rows (Excel CSV dialect) from stdin and write them
# tab separated to stdout, each row terminated by a plain "\n".
# Usage: echo "foo,bar" | c2t

    # An IOError while copying the rows is swallowed on purpose.
    local converter='import csv, sys
source = csv.reader(sys.stdin, dialect=csv.excel)
sink = csv.writer(sys.stdout, dialect=csv.excel_tab, lineterminator="\n")
try:
    sink.writerows(source)
except IOError:
    pass'
    python3 -c "${converter}"
}

function header {
# Show just the first line of the input. All arguments are handed to head,
# so the input is either the named file(s) or stdin.
# Usage:  header file.csv
# Usage:  cat file.csv | header

    local -r keep=1
    head -n "${keep}" "$@"
}

function noheader {
# Print the input starting from its second line, i.e. without the first row.
# All arguments are handed to tail, so the input is the named file or stdin.
# Usage:  noheader file.csv
# Usage:  cat file.csv | noheader

    local -r first_kept=2
    tail -n "+${first_kept}" "$@"
}

function tsvecho {
# Echo the given values as one row of double-quoted, tab separated fields.
# When stdin is not a terminal, the whitespace separated words read from it
# are appended after the arguments. Those words are taken literally: glob
# characters such as * are not expanded against the current directory.
# Double quotes inside a value are not escaped.
# Usage: tsvecho value1 value2 "some value" > header.csv
# Usage: echo value1 value2 | tsvecho

    local HEAD
    local -a PIPED=()
    if [[ ! -t 0 ]]; then
        # read -a splits on IFS just like an unquoted expansion, but performs
        # no pathname expansion. It reports non-zero at end of input even
        # though the array has been filled, hence the "|| true".
        read -r -d '' -a PIPED || true
    fi
    printf -v HEAD '"%s"\t' "$@" "${PIPED[@]}"
    # Drop the tab that follows the last field
    printf '%s\n' "${HEAD%?}"
}

function tsvstrip {
# Re-emit a tab separated stream from stdin with the field quoting removed
# Usage: cat file.csv | tsvstrip

    # Rows are parsed with the excel_tab dialect and written back with
    # quoting switched off; an IOError while writing is swallowed on purpose.
    local unquoter='import csv, sys
parsed = csv.reader(sys.stdin, dialect=csv.excel_tab)
plain = csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE)
try:
    plain.writerows(parsed)
except IOError:
    pass'
    python3 -c "${unquoter}"
}

function tsvtranspose {
# Transpose a tsv stream: rows read from stdin become columns on stdout.
# Rows shorter than the longest one are padded with empty fields.
# Output is written without quoting.
# Usage: cat file.csv | tsvtranspose

    # zip_longest is the Python 3 way to zip rows of unequal length; the
    # Python 2 idiom map(None, *rows) raises TypeError under python3.
    # An IOError while writing is swallowed on purpose.
    python3 -c 'import sys,csv
from itertools import zip_longest
rows = csv.reader(sys.stdin, dialect=csv.excel_tab)
out = csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE)
try:
    out.writerows(zip_longest(*rows, fillvalue=""))
except IOError:
    pass'
}

function tsvhead {
# Head a file, but print also the header. header not counted in line numbers.
# The work is done by the lib/tsvhead Python script under TSVDIR, which
# receives every argument unchanged.
# Usage: cat file | tsvhead -n 30

    local helper="${TSVDIR}/lib/tsvhead"
    if [ ! -t 0 ]; then
        # stdin is not a terminal: forward it to the helper through cat
        cat - | python3 "${helper}" "$@"
    else
        python3 "${helper}" "$@"
    fi
}

function tsvtail {
# Tail a file, but print also the header. header not counted in line numbers.
# The work is done by the lib/tsvtail Python script under TSVDIR, which
# receives every argument unchanged.
# Usage: cat file | tsvtail -n 30

    local helper="${TSVDIR}/lib/tsvtail"
    if [ ! -t 0 ]; then
        # stdin is not a terminal: forward it to the helper through cat
        cat - | python3 "${helper}" "$@"
    else
        python3 "${helper}" "$@"
    fi
}

which "csvjoin" > /dev/null && {

function tsvcut {
# csvcut for tab-delimited input: runs csvcut with -t plus the caller's
# arguments and converts its output back to tabs with c2t.
# See csvcut itself for the available options.
# Usage: tsvcut -c Col1,Col3 input1.tsv

    csvcut -t "$@" |
        c2t
}

function tsvformat {
# csvformat with tab-delimited dialect: runs csvformat with -t -T in front
# of the caller's arguments. See csvformat itself for the available options.
# Usage: tsvformat input1.tsv

    local -a tab_flags=(-t -T)
    csvformat "${tab_flags[@]}" "$@"
}

function tsvgrep {
# csvgrep for tab-delimited input: runs csvgrep with -t plus the caller's
# arguments and converts its output back to tabs with c2t.
# See csvgrep itself for the available options.
# Usage: tsvgrep -c Col2 -m searchString input1.tsv

    csvgrep -t "$@" |
        c2t
}

function tsvjoin {
# csvjoin for tab-delimited input: runs csvjoin with -t plus the caller's
# arguments and converts its output back to tabs with c2t.
# See csvjoin itself for the available options.
# Usage: tsvjoin -c 1,1 input1.tsv input2.tsv

    csvjoin -t "$@" |
        c2t
}

function tsvlook {
# csvlook with tab-delimited dialect: runs csvlook with -t in front of the
# caller's arguments. See csvlook itself for the available options.
# Usage: tsvlook file1.tsv

    local -a tab_flags=(-t)
    csvlook "${tab_flags[@]}" "$@"
}

function tsvquery {
# Simple SQL query for tab-delimited files. The tables are named tsv1,tsv2 ...,
# unless named with NAME=FILE syntax. If defining database with -d
# it is kept for later queries. A query is not optional(!), but it can be
# an empty string.
#
# Arguments: [-d DB] [FILE | NAME=FILE]... QUERY
#   -h       print usage and return 0
#   -d DB    use DB as the sqlite database file and keep it afterwards;
#            accepted at any position. Without -d a temporary database is
#            created with mktemp and removed before returning.
#   QUERY    always the last argument
# Returns: 1 when a table file is missing, the query is missing, -d has no
#          value or mktemp fails; otherwise the exit status of sqlite3.
# Diagnostics go to stderr. A database given with -d is never deleted.
# A failing csvsql import does not abort: the query still runs.
#
# Usage: tsvquery mytable=file1.tsv file2.tsv "SELECT * FROM tsv1,mytable WHERE C4 not like '%NA%' ORDER BY C4 DESC"
# Usage: tsvquery -d DB.sqlite tsv1=file1.tsv "SELECT * FROM tsv1;" # Will keep the database file, reusable later

    local USAGE='Usage:
    tsvquery [-d database.sqlite] tableName=data.tsv otherTable=data2.tsv "SQL QUERY"
    When defining a database with -d it is kept, and can be
    inserted with more data later. Otherwise the DB is created in /tmp/
    and deleted afterwards.
    If not using name=data.tsv syntax, tables are named tsv1, tsv2...
    Note: You have to give an SQL query. If you just want to
          populate a database, add " "  as an empty query.
        '
    local DBTEMP=""
    local KEEPDB=0
    local -a ARGS=()

    # Separate the options from the table specs and the query
    while (( $# > 0 )); do
        case "$1" in
            -h)
                echo "$USAGE"
                return 0
                ;;
            -d)
                if (( $# < 2 )); then
                    echo "tsvquery: -d needs a database file name" >&2
                    return 1
                fi
                DBTEMP=$2
                KEEPDB=1
                shift 2
                ;;
            *)
                ARGS+=("$1")
                shift
                ;;
        esac
    done

    if (( ${#ARGS[@]} == 0 )); then
        echo "tsvquery: no SQL query given, see tsvquery -h" >&2
        return 1
    fi
    # The last remaining argument is the query, everything before it a table
    local QUERY=${ARGS[${#ARGS[@]}-1]}
    local -a TABLES=("${ARGS[@]:0:${#ARGS[@]}-1}")

    if (( ! KEEPDB )); then
        DBTEMP=$( mktemp ) || {
            echo "tsvquery: could not create a temporary database" >&2
            return 1
        }
    fi

    local SPEC TBL FIL TBLNO OLDTBLS
    for SPEC in "${TABLES[@]}"; do
        if [[ -f "$SPEC" ]]; then
            # Plain file: pick the first tsvN name not yet in the database.
            # -w keeps tsv1 from matching tsv10.
            OLDTBLS=$( sqlite3 "$DBTEMP" ".tables" )
            TBLNO=1
            while grep -qw "tsv$TBLNO" <<< "$OLDTBLS"; do
                TBLNO=$(( TBLNO + 1 ))
            done
            TBL="tsv$TBLNO"
            FIL=$SPEC
        else
            # NAME=FILE: the name is everything before the first "="
            TBL=${SPEC%%=*}
            FIL=${SPEC#*=}
            if [[ ! -f "$FIL" ]]; then
                echo "File $SPEC not found" >&2
                # Only a temporary database may be thrown away here
                (( KEEPDB )) || rm -f -- "$DBTEMP"
                return 1
            fi
        fi
        csvsql -t --db "sqlite:///$DBTEMP" --insert --table "$TBL" < "$FIL"
    done

    sqlite3 -list -separator $'\t' -nullvalue NA -header -batch "$DBTEMP" "$QUERY"
    local EC=$?
    # remove DB if using temporary
    (( KEEPDB )) || rm -f -- "$DBTEMP"
    return $EC
}

function tsvsort {
# csvsort for tab-delimited input: runs csvsort with -t plus the caller's
# arguments and converts its output back to tabs with c2t.
# See csvsort itself for the available options.
# Usage: tsvsort -c Col3 input.tsv

    csvsort -t "$@" |
        c2t
}

function tsvstack {
# csvstack for tab-delimited input: runs csvstack with -t plus the caller's
# arguments and converts its output back to tabs with c2t.
# See csvstack itself for the available options.
# Usage: tsvstack file1.tsv file2.tsv

    csvstack -t "$@" |
        c2t
}

} || {
    # Message printed by each of the stubs below
    CSVKITERROR="no csvkit installed.  [sudo pip install csvkit]"
    # One stub per command that needs csvkit: print the message on stdout
    # and fail with status 1. tsvformat is included so that it reports the
    # missing dependency like its siblings instead of "command not found".
    tsvjoin () { echo "$CSVKITERROR"; return 1; }
    tsvcut () { echo "$CSVKITERROR"; return 1; }
    tsvformat () { echo "$CSVKITERROR"; return 1; }
    tsvgrep () { echo "$CSVKITERROR"; return 1; }
    tsvquery () { echo "$CSVKITERROR"; return 1; }
    tsvsort () { echo "$CSVKITERROR"; return 1; }
    tsvstack () { echo "$CSVKITERROR"; return 1; }
    tsvlook () { echo "$CSVKITERROR"; return 1; }
}

function tsvfold {
# Folds a sequence of line separated values into a TSV table with a header you specify,
# which means it may easily cause aliasing if you have the wrong number of them.
# Any loop which uses "print" to produce field values should ideally work.
# The header row is written with tsvecho; the number of header names is the
# number of values placed on each output row. With no header names the input
# lines are passed through one per row.
# When stdin is a terminal only the header row is printed.
# Usage: cat data | tsvfold Column1 Column2 ...
# Usage: seq 100 | tsvfold First Second Third Fourth

    local NCOLS=$#
    # Keep using mawk where it is installed, otherwise fall back to plain awk
    local AWK=awk
    command -v mawk > /dev/null 2>&1 && AWK=mawk

    # tsvecho gets an empty stdin so that it does not consume the data
    echo "" | tsvecho "$@"
    if ! [ -t 0 ]; then
        # After each line emit a tab, except after every NCOLS-th line where
        # the row is ended. The n > 0 guard avoids a division by zero.
        "$AWK" -F $'\t' -v n="$NCOLS" '{ ORS = (n > 0 && NR % n) ? FS : RS } 1'
    fi
}

function tsvdims {
# Print dimensions of a TSV as a two line table: a Columns/Rows header
# followed by the counts. The column count is taken from the first line,
# and that first line is not included in the row count.
# Empty input is reported as 0 columns and 0 rows.
# Reads the file named by the first argument, or stdin when it is missing
# or empty.
# Usage: tsvdims file.txt
# Usage: cat file.txt | tsvdims

    python3 -c 'import sys,csv
if sys.argv[1]=="":
    source=sys.stdin
else:
    source=open(sys.argv[1], newline="")
cols=0
lines=0
for row in csv.reader(source, dialect=csv.excel_tab):
    if lines==0:
        cols=len(row)
    lines+=1
csv.writer(sys.stdout, dialect=csv.excel_tab).writerows([["Columns","Rows"],[cols,max(lines-1,0)]])
' "$1"
}