manual tsvkit updater

2016-11-17 12:30:01 +02:00
parent e0e719aa1f
commit eb98ff014c
6 changed files with 468 additions and 0 deletions
--- a/tsv/tsvkit.sh
+++ b/tsv/tsvkit.sh
@@ -0,0 +1,257 @@
+## Tab separated file functions, wrappers for csvkit
+# To use the tsv- commands, install csvkit: <pre>sudo apt-get install python-pip; sudo pip install csvkit</pre>
+
+# Directory containing this file; the lib/tsvhead and lib/tsvtail helpers are looked up relative to it.
+TSVDIR=$( dirname "${BASH_SOURCE[0]}" )
+
+function c2t {
+# Convert a comma separated stream into a tab separated stream
+# Usage: echo "foo,bar" | c2t
+
+    # Rows are written with "\n" line endings; an IOError raised while
+    # streaming is silently ignored.
+    python -c 'import csv, sys
+try:
+    tsv_out = csv.writer(sys.stdout, dialect=csv.excel_tab, lineterminator="\n")
+    tsv_out.writerows(csv.reader(sys.stdin, dialect=csv.excel))
+except IOError:
+    pass'
+}
+
+function header {
+# Print only the first line of input. All arguments are handed to head, so
+# files are read when given and stdin is read otherwise.
+# Usage:  header file.csv
+# Usage:  cat file.csv | header
+
+    head -n 1 "$@"
+}
+
+function noheader {
+# Strip the first row of input, printing everything from line 2 onwards.
+# All arguments are handed to tail.
+# Usage:  noheader file.csv
+# Usage:  cat file.csv | noheader
+
+    tail -n +2 "$@"
+}
+
+function tsvecho {
+# Echo with tab separated values, quoted. Arguments come first; when stdin is
+# not a terminal, every whitespace separated word read from it is appended.
+# Values are wrapped in double quotes as-is (embedded quotes are not escaped).
+# Usage: tsvecho value1 value2 "some value" > header.csv
+# Usage: echo value1 value2 | tsvecho
+
+    local HEAD
+    local -a WORDS=()
+    if ! [[ -t 0 ]]; then
+        # read -a splits on IFS like an unquoted $(cat) would, but it never
+        # glob-expands, so a "*" arriving on stdin stays a literal "*".
+        # Its non-zero status at end of input is expected and ignored.
+        read -r -d '' -a WORDS
+    fi
+    printf -v HEAD "\"%s\"\t" "$@" "${WORDS[@]}"
+    # Drop the tab that follows the last value.
+    echo "${HEAD%?}"
+}
+
+function tsvstrip {
+# Strip tsv of quotes
+# Usage: cat file.csv | tsvstrip
+
+    # lineterminator="\n" as in c2t: the excel_tab default is "\r\n", which
+    # would leave a carriage return at the end of every output line.
+    # NOTE(review): with QUOTE_NONE and no escapechar the csv module raises an
+    # error for a field that would need quoting -- confirm that is intended.
+    # An IOError raised while streaming is silently ignored.
+    python -c 'import sys,csv
+try:
+    csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE, lineterminator="\n").writerows(csv.reader(sys.stdin, dialect=csv.excel_tab))
+except IOError:
+    pass'
+}
+
+function tsvtranspose {
+# Transpose a tsv file. Rows shorter than the longest one are padded with
+# empty cells.
+# Usage: cat file.csv | tsvtranspose
+
+    # zip_longest replaces map(None, ...), which only exists in Python 2 and
+    # split a single-row input into characters; the lookup below works on both
+    # Python 2 (izip_longest) and Python 3 (zip_longest).
+    # lineterminator="\n" as in c2t avoids the "\r\n" excel_tab default.
+    # An IOError raised while streaming is silently ignored.
+    python -c 'import sys,csv,itertools
+zip_longest = getattr(itertools, "zip_longest", None) or itertools.izip_longest
+try:
+    rows = csv.reader(sys.stdin, dialect=csv.excel_tab)
+    csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE, lineterminator="\n").writerows(zip_longest(*rows))
+except IOError:
+    pass'
+}
+
+function tsvhead {
+# Head a file, but print also the header. header not counted in line numbers
+# Usage: cat file | tsvhead -n 30
+
+    # Data arriving on stdin is relayed through cat into the helper script;
+    # when stdin is a terminal the helper is started directly.
+    if ! [ -t 0 ]; then
+        cat - | python "${TSVDIR}/lib/tsvhead" "$@"
+        return
+    fi
+    python "${TSVDIR}/lib/tsvhead" "$@"
+}
+
+function tsvtail {
+# Tail a file, but print also the header. header not counted in line numbers
+# Usage: cat file | tsvtail -n 30
+
+    # Data arriving on stdin is relayed through cat into the helper script;
+    # when stdin is a terminal the helper is started directly.
+    if ! [ -t 0 ]; then
+        cat - | python "${TSVDIR}/lib/tsvtail" "$@"
+        return
+    fi
+    python "${TSVDIR}/lib/tsvtail" "$@"
+}
+
+# Define the csvkit-backed wrappers only when csvkit (probed via csvjoin) is
+# available. command -v is a shell builtin, so the probe needs no external
+# "which" and prints nothing when csvjoin is missing.
+command -v "csvjoin" > /dev/null 2>&1 && {
+
+function tsvcut {
+# csvcut with tab-delimited dialect, see original script for options.
+# All arguments are passed on to csvcut unchanged.
+# Usage: tsvcut -c Col1,Col3 input1.tsv
+
+    
+    # csvcut's comma separated output is converted back to tabs by c2t.
+    csvcut -t "$@" | c2t
+}
+
+function tsvformat {
+# csvformat with tab-delimited dialect, see original script for options.
+# All arguments are passed on to csvformat unchanged.
+# Usage: tsvformat input1.tsv
+
+    
+    # No c2t step here, unlike the other wrappers: -T presumably makes
+    # csvformat write tab-delimited output itself (see the csvformat help).
+    csvformat -t -T "$@"
+}
+
+function tsvgrep {
+# csvgrep with tab-delimited dialect, see original script for options.
+# All arguments are passed on to csvgrep unchanged.
+# Usage: tsvgrep -c Col2 -m searchString input1.tsv
+
+    
+    # csvgrep's comma separated output is converted back to tabs by c2t.
+    csvgrep -t "$@" | c2t
+}
+
+function tsvjoin {
+# csvjoin with tab-delimited dialect, see original script for options.
+# All arguments are passed on to csvjoin unchanged.
+# Usage: tsvjoin -c 1,1 input1.tsv input2.tsv
+
+    
+    # csvjoin's comma separated output is converted back to tabs by c2t.
+    csvjoin -t "$@" | c2t
+}
+
+function tsvlook {
+# csvlook with tab-delimited dialect, see original script for options.
+# All arguments are passed on to csvlook unchanged; its output is printed
+# as-is (no conversion through c2t).
+# Usage: tsvlook file1.tsv
+
+    
+    csvlook -t "$@"
+}
+
+function tsvquery {
+# Simple SQL query for tab-delimited files. The tables are named tsv1,tsv2 ...,
+# unless named with NAME=FILE syntax. If defining database with -d
+# it is kept for later queries. A query is not optional(!), but it can be
+# an empty string.
+# Usage: tsvquery mytable=file1.tsv file2.tsv "SELECT * FROM tsv1,mytable WHERE C4 not like '%NA%' ORDER BY C4 DESC"
+# Usage: tsvquery -d DB.sqlite tsv1=file1.tsv "SELECT * FROM tsv1;" # Will keep the database file, reusable later
+
+    # Returns the exit status of sqlite3, or 1 when an input file cannot be
+    # found or imported, when -d has no value, or when no query is given.
+    # -h prints the usage text and returns 0.
+    local DBTEMP="" KEEPDB="" QUERY ARG TBL FIL OLDTBLS TBLNO EC
+    local -a ARGS=()
+
+    # Pass 1: pick out -h and "-d FILE" wherever they appear; everything else
+    # (the inputs followed by the query) is collected, in order, in ARGS.
+    while [ $# -gt 0 ]; do
+        case "$1" in
+        -h)
+            echo 'Usage:
+    tsvquery [-d database.sqlite] tableName=data.tsv otherTable=data2.tsv "SQL QUERY"
+    When defining a database with -d it is kept, and can be
+    inserted with more data later. Otherwise the DB is created in /tmp/
+    and deleted afterwards.
+    If not using name=data.tsv syntax, tables are named tsv1, tsv2...  
+    Note: You have to give an SQL query. If you just want to 
+          populate a database, add " "  as an empty query.
+        '
+            return 0
+            ;;
+        -d)
+            if [ $# -lt 2 ]; then
+                echo "tsvquery: -d needs a database file" >&2
+                return 1
+            fi
+            # User defined data base: remembered so it is never deleted below
+            DBTEMP="$2"
+            KEEPDB=1
+            shift 2
+            ;;
+        *)
+            ARGS+=( "$1" )
+            shift
+            ;;
+        esac
+    done
+
+    # The last remaining argument is the query; without this check an empty
+    # argument list would end up running the shell's own name as SQL.
+    if [ ${#ARGS[@]} -eq 0 ]; then
+        echo "tsvquery: no SQL query given, see tsvquery -h" >&2
+        return 1
+    fi
+    QUERY="${ARGS[${#ARGS[@]}-1]}"
+    unset "ARGS[${#ARGS[@]}-1]"
+
+    if [ -z "$KEEPDB" ]; then
+        DBTEMP=$( mktemp ) || return 1
+    fi
+
+    # Pass 2: import every input into its table.
+    for ARG in "${ARGS[@]}"; do
+        if [ -f "$ARG" ]; then
+            # Plain file: use the first tsvN name not yet in the database.
+            # The table list is flattened to " name name " so that each name
+            # is matched whole (tsv1 must not match tsv10).
+            OLDTBLS=" $( sqlite3 "$DBTEMP" ".tables" | tr -s '[:space:]' ' ' ) "
+            TBLNO=1
+            while [[ "$OLDTBLS" == *" tsv${TBLNO} "* ]]; do
+                TBLNO=$(( TBLNO + 1 ))
+            done
+            TBL="tsv$TBLNO"
+            FIL="$ARG"
+        else
+            # NAME=FILE: split at the first "=" without going through sed, so
+            # characters in NAME are never interpreted as a regex
+            TBL="${ARG%%=*}"
+            FIL="${ARG#*=}"
+            if [ ! -f "$FIL" ]; then
+                echo File "$ARG" not found >&2
+                # only a temporary database may be removed, never the -d one
+                [ -n "$KEEPDB" ] || rm -f "$DBTEMP"
+                return 1
+            fi
+        fi
+        if ! csvsql -t --db "sqlite:///$DBTEMP" --insert --table "$TBL" < "$FIL"; then
+            echo "tsvquery: could not import $FIL into table $TBL" >&2
+            [ -n "$KEEPDB" ] || rm -f "$DBTEMP"
+            return 1
+        fi
+    done
+
+    sqlite3 -list -separator $'\t' -nullvalue NA -header -batch "$DBTEMP" "$QUERY"
+    EC=$?
+    # remove DB if using temporary
+    [ -n "$KEEPDB" ] || rm -f "$DBTEMP"
+    return $EC
+}
+
+function tsvsort {
+# csvsort with tab-delimited dialect, see original script for options.
+# All arguments are passed on to csvsort unchanged.
+# Usage: tsvsort -c Col3 input.tsv
+
+    
+    # csvsort's comma separated output is converted back to tabs by c2t.
+    csvsort -t "$@" | c2t
+}
+
+function tsvstack {
+# csvstack with tab-delimited dialect, see original script for options.
+# All arguments are passed on to csvstack unchanged.
+# Usage: tsvstack file1.tsv file2.tsv
+
+    
+    # csvstack's comma separated output is converted back to tabs by c2t.
+    csvstack -t "$@" | c2t
+}
+
+} || {
+    # csvkit is not installed: every csvkit-backed command becomes a stub that
+    # prints the install hint and returns 1.
+    CSVKITERROR="no csvkit installed.  [sudo pip install csvkit]"
+    tsvjoin () { echo $CSVKITERROR; return 1; }
+    tsvcut () { echo $CSVKITERROR; return 1; }
+    # tsvformat is defined in the csvkit branch as well, so it needs a stub too
+    tsvformat () { echo $CSVKITERROR; return 1; }
+    tsvgrep () { echo $CSVKITERROR; return 1; }
+    tsvquery () { echo $CSVKITERROR; return 1; }
+    tsvsort () { echo $CSVKITERROR; return 1; }
+    tsvstack () { echo $CSVKITERROR; return 1; }
+    tsvlook () { echo $CSVKITERROR; return 1; }
+}
+
+function tsvfold {
+# Folds a sequence of line separated arguments into a TSV table with a header you specify,
+# which means it may easily cause aliasing if you have the wrong number of them.
+# Any loop which uses "print" to produce field values should ideally work.
+# Usage: cat data | tsvfold Column1 Column2 ...
+# Usage: seq 100 | tsvfold First Second Third Fourth
+
+
+    # Header row. tsvecho appends whatever it finds on a non-terminal stdin,
+    # so it is fed an empty line here to keep it away from the data.
+    echo "" | tsvecho "$@"
+    if ! [ -t 0 ]; then
+        # n = number of columns. Each input line is followed by a tab, except
+        # every n-th one, which ends the row with a newline. Passing n with -v
+        # (not spliced into the program) and testing n > 0 first avoids a
+        # division by zero when no column names are given; the data is then
+        # passed through one value per line. Plain awk suffices, so mawk is
+        # not required.
+        awk -F $'\t' -v n="$#" '{ORS=((n > 0 && NR % n) ? FS : RS)}1'
+    fi
+}
+
+function tsvdims {
+# Print dimensions of a TSV as a two line table: a "Columns"/"Rows" header
+# and the counts. Columns is the width of the first line; Rows counts the
+# lines after the first (header) line. Empty input gives 0 and 0.
+# Usage: tsvdims file.txt
+# Usage: cat file.txt | tsvdims 
+
+    # "$1" is always passed, so a missing argument reaches Python as an empty
+    # string, which selects stdin. cols starts at 0 and rows is clamped at 0
+    # so that empty input no longer fails on an unset variable.
+    # lineterminator="\n" as in c2t avoids the "\r\n" excel_tab default.
+    python -c 'import sys,csv
+source = open(sys.argv[1]) if sys.argv[1] else sys.stdin
+cols = 0
+rows = -1
+try:
+    for row in csv.reader(source, dialect=csv.excel_tab):
+        if rows == -1:
+            cols = len(row)
+        rows += 1
+finally:
+    if source is not sys.stdin:
+        source.close()
+csv.writer(sys.stdout, dialect=csv.excel_tab, lineterminator="\n").writerows([["Columns","Rows"],[cols,max(rows,0)]])
+' "$1"
+}