258 lines
6.9 KiB
Bash
258 lines
6.9 KiB
Bash
## Tab separated file functions, wrappers for csvkit
|
|
# To use the tsv- commands, install csvkit: <pre>sudo apt-get install python-pip; sudo pip install csvkit</pre>
|
|
|
|
# Directory containing this file; tsvhead and tsvtail use it to locate their
# helper scripts under "${TSVDIR}/lib/".
TSVDIR=$( dirname "${BASH_SOURCE[0]}" )
|
|
|
|
c2t() {
  # Read CSV (comma separated, Excel dialect) on stdin and write the same
  # records to stdout as tab separated values with "\n" line endings.
  # An IOError while converting (e.g. the reader closing the pipe) is ignored.
  # Usage: echo "foo,bar" | c2t

  local converter='import sys,csv
try:
    reader = csv.reader(sys.stdin, dialect=csv.excel)
    writer = csv.writer(sys.stdout, dialect=csv.excel_tab, lineterminator="\n")
    writer.writerows(reader)
except IOError:
    pass'
  python3 -c "$converter"
}
|
|
|
|
header() {
  # Emit just the first line of the given file(s), or of stdin when no file
  # is named. All arguments are handed to head unchanged.
  # Usage: header file.csv
  # Usage: cat file.csv | header

  head -n 1 "$@"
}
|
|
|
|
noheader() {
  # Emit everything except the first line of the given file(s), or of stdin
  # when no file is named. All arguments are handed to tail unchanged.
  # Usage: noheader file.csv
  # Usage: cat file.csv | noheader

  tail -n +2 "$@"
}
|
|
|
|
tsvecho() {
  # Echo with tab separated values, quoted.
  # Every argument becomes one double-quoted field. When stdin is not a
  # terminal, the whitespace-separated words read from stdin are appended as
  # further fields. Words from stdin are taken literally: they are NOT
  # subject to pathname expansion, so a "*" on stdin stays a "*".
  # Usage: tsvecho value1 value2 "some value" > header.csv
  # Usage: echo value1 value2 | tsvecho

  local HEAD
  local -a words=()
  if [[ ! -t 0 ]]; then
    # -d '' makes read consume all of stdin and split it on IFS without
    # globbing. It returns non-zero at EOF (no NUL delimiter is found) but
    # the array is still filled, so that status is deliberately ignored.
    read -r -d '' -a words || true
  fi
  printf -v HEAD '"%s"\t' "$@" "${words[@]}"
  # Drop the trailing tab left by the repeated format.
  printf '%s\n' "${HEAD%?}"
}
|
|
|
|
tsvstrip() {
  # Strip tsv of quotes: parse stdin as tab separated values and re-emit the
  # fields without any quoting.
  # Lines are terminated with "\n" (as in c2t) rather than the csv module's
  # default "\r\n", so the last field does not pick up a stray carriage
  # return in Unix pipelines.
  # An IOError while writing (e.g. the reader closing the pipe) is ignored.
  # Usage: cat file.csv | tsvstrip

  python3 -c 'import sys,csv
try:
    reader = csv.reader(sys.stdin, dialect=csv.excel_tab)
    writer = csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE, lineterminator="\n")
    writer.writerows(reader)
except IOError:
    pass'
}
|
|
|
|
tsvtranspose() {
  # Transpose a tsv file: rows become columns and columns become rows.
  # Short rows are padded with empty fields so the result is rectangular.
  # Uses itertools.zip_longest, the Python 3 equivalent of the Python 2 only
  # map(None, ...) idiom, which fails under python3.
  # Lines are terminated with "\n", as in c2t.
  # An IOError while writing (e.g. the reader closing the pipe) is ignored.
  # Usage: cat file.csv | tsvtranspose

  python3 -c 'import sys,csv
from itertools import zip_longest
try:
    reader = csv.reader(sys.stdin, dialect=csv.excel_tab)
    writer = csv.writer(sys.stdout, dialect=csv.excel_tab, quoting=csv.QUOTE_NONE, lineterminator="\n")
    writer.writerows(zip_longest(*reader, fillvalue=""))
except IOError:
    pass'
}
|
|
|
|
tsvhead() {
  # head for files with a header line: the header is printed as well and is
  # not counted in the requested number of lines. The work is done by the
  # helper script lib/tsvhead next to this file; all arguments go to it.
  # Usage: cat file | tsvhead -n 30

  local helper="${TSVDIR}/lib/tsvhead"
  if [[ ! -t 0 ]]; then
    # stdin carries data: feed it to the helper through a pipe
    cat - | python3 "$helper" "$@"
    return
  fi
  python3 "$helper" "$@"
}
|
|
|
|
tsvtail() {
  # tail for files with a header line: the header is printed as well and is
  # not counted in the requested number of lines. The work is done by the
  # helper script lib/tsvtail next to this file; all arguments go to it.
  # Usage: cat file | tsvtail -n 30

  local helper="${TSVDIR}/lib/tsvtail"
  if [[ ! -t 0 ]]; then
    # stdin carries data: feed it to the helper through a pipe
    cat - | python3 "$helper" "$@"
    return
  fi
  python3 "$helper" "$@"
}
|
|
|
|
which "csvjoin" > /dev/null && {
|
|
|
|
tsvcut() {
  # Column selection for tab-delimited input: runs csvcut in tab mode (-t)
  # with the caller's options and converts its output back to tabs via c2t.
  # See csvcut for the available options.
  # Usage: tsvcut -c Col1,Col3 input1.tsv

  csvcut -t "$@" \
    | c2t
}
|
|
|
|
tsvformat() {
  # csvformat for tab-delimited data: runs csvformat with -t and -T ahead of
  # the caller's options. See csvformat for the available options.
  # Usage: tsvformat input1.tsv

  local -a tab_flags=(-t -T)
  csvformat "${tab_flags[@]}" "$@"
}
|
|
|
|
tsvgrep() {
  # Row filtering for tab-delimited input: runs csvgrep in tab mode (-t)
  # with the caller's options and converts its output back to tabs via c2t.
  # See csvgrep for the available options.
  # Usage: tsvgrep -c Col2 -m searchString input1.tsv

  csvgrep -t "$@" \
    | c2t
}
|
|
|
|
tsvjoin() {
  # Join tab-delimited files: runs csvjoin in tab mode (-t) with the
  # caller's options and converts its output back to tabs via c2t.
  # See csvjoin for the available options.
  # Usage: tsvjoin -c 1,1 input1.tsv input2.tsv

  csvjoin -t "$@" \
    | c2t
}
|
|
|
|
tsvlook() {
  # csvlook for tab-delimited data: runs csvlook with -t ahead of the
  # caller's options. See csvlook for the available options.
  # Usage: tsvlook file1.tsv

  local -a tab_flags=(-t)
  csvlook "${tab_flags[@]}" "$@"
}
|
|
|
|
tsvquery() {
  # Simple SQL query for tab-delimited files. The tables are named tsv1,tsv2 ...,
  # unless named with NAME=FILE syntax. If defining database with -d
  # it is kept for later queries. A query is not optional(!), but it can be
  # an empty string.
  # Usage: tsvquery mytable=file1.tsv file2.tsv "SELECT * FROM tsv1,mytable WHERE C4 not like '%NA%' ORDER BY C4 DESC"
  # Usage: tsvquery -d DB.sqlite tsv1=file1.tsv "SELECT * FROM tsv1;" # Will keep the database file, reusable later
  #
  # Arguments: [-h] [-d DBFILE] [FILE | NAME=FILE]... QUERY
  #   -h and -d are recognised at any position. The last remaining argument
  #   is the SQL query; every argument before it is a table source.
  # Returns: 0 for -h; 1 for a missing query, a missing -d value or a source
  #   file that does not exist; otherwise the exit status of the first failing
  #   sqlite3/csvsql call, or of the final sqlite3 query.
  # A temporary database (no -d) is always removed before returning; a
  # database given with -d is never removed.

  local usage='Usage:
  tsvquery [-d database.sqlite] tableName=data.tsv otherTable=data2.tsv "SQL QUERY"
  When defining a database with -d it is kept, and can be
  inserted with more data later. Otherwise the DB is created in /tmp/
  and deleted afterwards.
  If not using name=data.tsv syntax, tables are named tsv1, tsv2...
  Note: You have to give an SQL query. If you just want to
  populate a database, add " " as an empty query.
'
  local db="" keep_db=0 status=0
  local -a rest=()

  # Separate the options from the table sources and the query.
  while (( $# > 0 )); do
    case "$1" in
      -h)
        printf '%s\n' "$usage"
        return 0
        ;;
      -d)
        if (( $# < 2 )); then
          echo "tsvquery: -d requires a database file" >&2
          return 1
        fi
        db=$2
        keep_db=1
        shift 2
        ;;
      *)
        rest+=("$1")
        shift
        ;;
    esac
  done

  if (( ${#rest[@]} == 0 )); then
    echo "tsvquery: missing SQL query (see tsvquery -h)" >&2
    return 1
  fi
  local last=$(( ${#rest[@]} - 1 ))
  local query=${rest[last]}
  local -a sources=("${rest[@]:0:last}")

  if (( ! keep_db )); then
    db=$(mktemp) || return 1
  fi

  local src table file existing n
  for src in "${sources[@]}"; do
    if [[ -f "$src" ]]; then
      # Plain file: pick the first tsvN name not yet present in the database.
      existing=$(sqlite3 "$db" ".tables") || { status=$?; break; }
      n=1
      # -w: whole-word match, so tsv1 is not considered taken by tsv10.
      while grep -qw -- "tsv${n}" <<<"$existing"; do
        n=$(( n + 1 ))
      done
      table="tsv${n}"
      file=$src
    else
      # NAME=FILE: split at the first "=".
      table=${src%%=*}
      file=${src#*=}
      if [[ ! -f "$file" ]]; then
        echo "File $src not found" >&2
        status=1
        break
      fi
    fi
    csvsql -t --db "sqlite:///$db" --insert --table "$table" < "$file" \
      || { status=$?; break; }
  done

  if (( status == 0 )); then
    # NOTE(review): the separator reached this review as a single space,
    # presumably a literal tab mangled in transit; a tab is used so the
    # output is TSV like the input - confirm.
    sqlite3 -list -separator $'\t' -nullvalue NA -header -batch "$db" "$query"
    status=$?
  fi

  # remove DB if using temporary
  if (( ! keep_db )); then
    rm -f -- "$db"
  fi
  return "$status"
}
|
|
|
|
tsvsort() {
  # Sort tab-delimited input: runs csvsort in tab mode (-t) with the
  # caller's options and converts its output back to tabs via c2t.
  # See csvsort for the available options.
  # Usage: tsvsort -c Col3 input.tsv

  csvsort -t "$@" \
    | c2t
}
|
|
|
|
tsvstack() {
  # Stack tab-delimited files: runs csvstack in tab mode (-t) with the
  # caller's options and converts its output back to tabs via c2t.
  # See csvstack for the available options.
  # Usage: tsvstack file1.tsv file2.tsv

  csvstack -t "$@" \
    | c2t
}
|
|
|
|
} || {
|
|
# Fallbacks used when csvkit is not installed: every csvkit-backed command
# prints the same hint on stderr (so it never ends up in piped data) and
# returns 1. Each command that has a csvkit-backed definition gets a stub,
# including tsvformat.
CSVKITERROR="no csvkit installed. [sudo pip install csvkit]"
tsvjoin () { echo "$CSVKITERROR" >&2; return 1; }
tsvcut () { echo "$CSVKITERROR" >&2; return 1; }
tsvformat () { echo "$CSVKITERROR" >&2; return 1; }
tsvgrep () { echo "$CSVKITERROR" >&2; return 1; }
tsvquery () { echo "$CSVKITERROR" >&2; return 1; }
tsvsort () { echo "$CSVKITERROR" >&2; return 1; }
tsvstack () { echo "$CSVKITERROR" >&2; return 1; }
tsvlook () { echo "$CSVKITERROR" >&2; return 1; }
|
|
}
|
|
|
|
tsvfold() {
  # Folds a sequence of line separated values into a TSV table with a header you specify,
  # which means it may easily cause aliasing if you have the wrong number of them.
  # Any loop which prints one field value per line should ideally work.
  # The header is written by tsvecho; then, unless stdin is a terminal, every
  # N input lines (N = number of column names, at least 1) are joined with
  # tabs into one row. A short final row is still terminated by a newline
  # and has no trailing tab.
  # Usage: cat data | tsvfold Column1 Column2 ...
  # Usage: seq 100 | tsvfold First Second Third Fourth

  local ncols=$#
  # With no column names tsvecho still prints one (empty) header field, so
  # fold into a single column instead of dividing by zero.
  if (( ncols < 1 )); then
    ncols=1
  fi

  # Prefer mawk as before, but fall back to any awk when it is missing.
  local awk_bin=awk
  if command -v mawk > /dev/null 2>&1; then
    awk_bin=mawk
  fi

  # Header only; stdin is detached so tsvecho does not consume the data.
  tsvecho "$@" < /dev/null

  if ! [[ -t 0 ]]; then
    # The separator is printed BEFORE each value (nothing before the first,
    # a newline at the start of each row, a tab otherwise) and the last row
    # is closed in END.
    "$awk_bin" -v n="$ncols" '
      { printf "%s%s", (NR == 1 ? "" : ((NR - 1) % n ? "\t" : "\n")), $0 }
      END { if (NR > 0) printf "\n" }'
  fi
}
|
|
|
|
tsvdims() {
  # Print dimensions of a TSV as a two-line table: a "Columns"/"Rows" header
  # followed by the counts. Columns is the number of fields in the first
  # line; Rows is the number of records after that first (header) line.
  # Empty input reports 0 columns and 0 rows.
  # Reads the file named by the first argument, or stdin when it is missing
  # or empty. Output lines end in "\n".
  # Usage: tsvdims file.txt
  # Usage: cat file.txt | tsvdims

  python3 -c 'import sys,csv
path = sys.argv[1]
stream = sys.stdin if path == "" else open(path, newline="")
cols = 0
rows = -1
try:
    for row in csv.reader(stream, dialect=csv.excel_tab):
        if rows == -1:
            cols = len(row)
        rows += 1
finally:
    if stream is not sys.stdin:
        stream.close()
csv.writer(sys.stdout, dialect=csv.excel_tab, lineterminator="\n").writerows([["Columns","Rows"],[cols,max(rows,0)]])
' "${1:-}"
}
|