## Tab separated file functions, wrappers for csvkit
# To use the tsv- commands, install csvkit:
#   sudo apt-get install python-pip; sudo pip install csvkit
#
# This file is meant to be sourced from bash. It defines small helpers for
# tab separated streams; the python based ones need python3, the tsv<tool>
# wrappers need csvkit, and tsvquery additionally needs the sqlite3 CLI.

# Absolute directory of this file. Resolved once at source time so that the
# lib/ scripts are still found after the caller changes directory.
TSVDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" > /dev/null 2>&1 && pwd )

function c2t {
    # Convert comma separated stream in to tab separated stream
    # Usage: echo "foo,bar" | c2t
    # A closed output pipe (e.g. "| head") is silently ignored.
    python3 -c 'import sys,csv
try:
    csv.writer(sys.stdout, dialect=csv.excel_tab, lineterminator="\n").writerows(csv.reader(sys.stdin, dialect=csv.excel))
except IOError:
    pass'
}

function header {
    # Print only the first line of input
    # Usage: header file.csv
    # Usage: cat file.csv | header
    head -n 1 "$@"
}

function noheader {
    # Strip first row of input
    # Usage: noheader file.csv
    # Usage: cat file.csv | noheader
    tail -n +2 "$@"
}

function tsvecho {
    # Echo with tab separated values, quoted
    # Usage: tsvecho value1 value2 "some value" > header.csv
    # Usage: echo value1 value2 | tsvecho
    # When stdin is not a terminal, its whitespace separated words are
    # appended after the arguments. With no values at all a single empty
    # quoted field ("") is printed.
    local HEAD
    local -a words=()
    if ! [[ -t 0 ]]; then
        # read returns non-zero at end of input even though the array is
        # filled; -r keeps backslashes, and no glob expansion takes place.
        read -r -d '' -a words || true
    fi
    printf -v HEAD '"%s"\t' "$@" "${words[@]}"
    # Drop the tab that trails the last field.
    printf '%s\n' "${HEAD%?}"
}

function tsvstrip {
    # Strip tsv of quotes
    # Usage: cat file.csv | tsvstrip
    # Output lines end with a plain newline, like c2t.
    python3 -c 'import sys,csv
try:
    csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE, lineterminator="\n").writerows(csv.reader(sys.stdin, dialect=csv.excel_tab))
except IOError:
    pass'
}

function tsvtranspose {
    # Transpose a tsv file
    # Usage: cat file.csv | tsvtranspose
    # Short rows are padded with empty fields so that the result is
    # rectangular.
    python3 -c 'import sys,csv
from itertools import zip_longest
try:
    rows = csv.reader(sys.stdin, dialect=csv.excel_tab)
    csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE, lineterminator="\n").writerows(zip_longest(*rows, fillvalue=""))
except IOError:
    pass'
}

function tsvhead {
    # Head a file, but print also the header. header not counted in line numbers
    # Usage: cat file | tsvhead -n 30
    # stdin is inherited by the helper script, so piped input needs no
    # special handling here.
    python3 "${TSVDIR}/lib/tsvhead" "$@"
}

function tsvtail {
    # Tail a file, but print also the header. header not counted in line numbers
    # Usage: cat file | tsvtail -n 30
    # stdin is inherited by the helper script, so piped input needs no
    # special handling here.
    python3 "${TSVDIR}/lib/tsvtail" "$@"
}

if command -v csvjoin > /dev/null 2>&1; then

    function tsvcut {
        # csvcut with tab-delimited dialect, see original script for options
        # Usage: tsvcut -c Col1,Col3 input1.tsv
        csvcut -t "$@" | c2t
    }

    function tsvformat {
        # csvformat with tab-delimited dialect, see original script for options
        # Usage: tsvformat input1.tsv
        csvformat -t -T "$@"
    }

    function tsvgrep {
        # csvgrep with tab-delimited dialect, see original script for options
        # Usage: tsvgrep -c Col2 -m searchString input1.tsv
        csvgrep -t "$@" | c2t
    }

    function tsvjoin {
        # csvjoin with tab-delimited dialect, see original script for options
        # Usage: tsvjoin -c 1,1 input1.tsv input2.tsv
        csvjoin -t "$@" | c2t
    }

    function tsvlook {
        # csvlook with tab-delimited dialect, see original script for options
        # Usage: tsvlook file1.tsv
        csvlook -t "$@"
    }

    function tsvquery {
        # Simple SQL query for tab-delimited files. The tables are named tsv1,tsv2 ...,
        # unless named with NAME=FILE syntax. If defining database with -d
        # it is kept for later queries. A query is not optional(!), but it can be
        # an empty string.
        # Usage: tsvquery mytable=file1.tsv file2.tsv "SELECT * FROM tsv1,mytable WHERE C4 not like '%NA%' ORDER BY C4 DESC"
        # Usage: tsvquery -d DB.sqlite tsv1=file1.tsv "SELECT * FROM tsv1;" # Will keep the database file, reusable later
        # Returns: exit status of the sqlite3 query; 1 on a usage error, a
        # missing input file or a failed import.
        local db="" keep_db=0
        local arg tbl fil existing n rc last query
        local -a args=()

        # Split options from table/query arguments. -d may appear anywhere.
        while (( $# > 0 )); do
            case "$1" in
                -h)
                    echo 'Usage: tsvquery [-d database.sqlite] tableName=data.tsv otherTable=data2.tsv "SQL QUERY"
When defining a database with -d it is kept, and can be inserted with more data later.
Otherwise the DB is created in /tmp/ and deleted afterwards.
If not using name=data.tsv syntax, tables are named tsv1, tsv2...
Note: You have to give an SQL query. If you just want to populate a database, add " " as an empty query.
'
                    return 0
                    ;;
                -d)
                    if (( $# < 2 )); then
                        echo "tsvquery: -d requires a database file" >&2
                        return 1
                    fi
                    db=$2
                    keep_db=1
                    shift 2
                    ;;
                *)
                    args+=("$1")
                    shift
                    ;;
            esac
        done

        if (( ${#args[@]} == 0 )); then
            echo 'tsvquery: missing SQL query (use " " as an empty query), see tsvquery -h' >&2
            return 1
        fi

        # The last remaining argument is the query, the rest are tables.
        last=$(( ${#args[@]} - 1 ))
        query=${args[last]}
        args=("${args[@]:0:last}")

        if (( ! keep_db )); then
            db=$( mktemp ) || {
                echo "tsvquery: could not create a temporary database" >&2
                return 1
            }
        fi

        for arg in "${args[@]}"; do
            if [[ -f "$arg" ]]; then
                # Plain file: pick the first unused tsvN table name. Names
                # are compared as whole words so tsv1 does not match tsv10.
                existing=" $( sqlite3 "$db" ".tables" | tr -s '[:space:]' ' ' ) "
                n=1
                while [[ "$existing" == *" tsv${n} "* ]]; do
                    n=$(( n + 1 ))
                done
                tbl="tsv${n}"
                fil=$arg
            else
                # NAME=FILE: everything before the first "=" is the table.
                tbl=${arg%%=*}
                fil=${arg#*=}
                if [[ ! -f "$fil" ]]; then
                    echo "File ${arg} not found" >&2
                    # Only a temporary database is ours to delete.
                    (( keep_db )) || rm -f -- "$db"
                    return 1
                fi
            fi

            if ! csvsql -t --db "sqlite:///${db}" --insert --table "$tbl" < "$fil"; then
                echo "tsvquery: could not import ${fil} into table ${tbl}" >&2
                (( keep_db )) || rm -f -- "$db"
                return 1
            fi
        done

        # NOTE(review): the separator is assumed to be a tab, matching the
        # tab-delimited contract of this file - confirm against history.
        sqlite3 -list -separator $'\t' -nullvalue NA -header -batch "$db" "$query"
        rc=$?

        # remove DB if using temporary
        (( keep_db )) || rm -f -- "$db"
        return "$rc"
    }

    function tsvsort {
        # csvsort with tab-delimited dialect, see original script for options
        # Usage: tsvsort -c Col3 input.tsv
        csvsort -t "$@" | c2t
    }

    function tsvstack {
        # csvstack with tab-delimited dialect, see original script for options
        # Usage: tsvstack file1.tsv file2.tsv
        csvstack -t "$@" | c2t
    }

else

    # csvkit is missing: define every wrapper as a stub that reports the
    # problem on stderr and fails.
    CSVKITERROR="no csvkit installed. [sudo pip install csvkit]"
    tsvjoin () { echo "$CSVKITERROR" >&2; return 1; }
    tsvcut () { echo "$CSVKITERROR" >&2; return 1; }
    tsvformat () { echo "$CSVKITERROR" >&2; return 1; }
    tsvgrep () { echo "$CSVKITERROR" >&2; return 1; }
    tsvquery () { echo "$CSVKITERROR" >&2; return 1; }
    tsvsort () { echo "$CSVKITERROR" >&2; return 1; }
    tsvstack () { echo "$CSVKITERROR" >&2; return 1; }
    tsvlook () { echo "$CSVKITERROR" >&2; return 1; }

fi

function tsvfold {
    # Folds a sequence of line separated arguments into a TSV table with a header you specify,
    # which means it may easily cause aliasing if you have the wrong number of them.
    # Any loop which uses "print" to produce field values should ideally work.
    # Usage: cat data | tsvfold Column1 Column2 ...
    # Usage: seq 100 | tsvfold First Second Third Fourth
    # Returns 1 when no column names are given.
    if (( $# == 0 )); then
        echo "tsvfold: at least one column name is required" >&2
        return 1
    fi

    # The empty echo keeps tsvecho from consuming the data on our stdin.
    echo "" | tsvecho "$@"

    if ! [ -t 0 ]; then
        # Join every n input lines with tabs; a final partial row is
        # still terminated with a newline.
        awk -v n="$#" '
            { ORS = (NR % n ? "\t" : "\n") } 1
            END { if (NR % n) printf "\n" }'
    fi
}

function tsvdims {
    # Print dimensions of a TSV
    # Usage: tsvdims file.txt
    # Usage: cat file.txt | tsvdims
    # Columns is the field count of the first line; Rows does not count
    # that first (header) line. Empty input is reported as 0 and 0.
    python3 -c 'import sys,csv
if sys.argv[1]=="":
    source=sys.stdin
else:
    source=open(sys.argv[1], newline="")
cols=0
rows=-1
for row in csv.reader(source, dialect=csv.excel_tab):
    if rows==-1:
        cols=len(row)
    rows+=1
csv.writer(sys.stdout, dialect=csv.excel_tab, lineterminator="\n").writerows([["Columns","Rows"],[cols,max(rows,0)]])
' "${1:-}"
}