From e2c4b2c147b8a1b0379e6836fd1b5aaddee540b5 Mon Sep 17 00:00:00 2001
From: Q
Date: Fri, 29 Oct 2021 21:31:12 +0300
Subject: [PATCH] Installable TSV Filter tool

---
 tsv/lib/TSVFilter/TSVFilter/__init__.py |   8 +
 tsv/lib/TSVFilter/TSVFilter/filter.py   | 248 ++++++++++++++++++++++++
 tsv/lib/TSVFilter/setup.py              |  19 ++
 3 files changed, 275 insertions(+)
 create mode 100644 tsv/lib/TSVFilter/TSVFilter/__init__.py
 create mode 100644 tsv/lib/TSVFilter/TSVFilter/filter.py
 create mode 100644 tsv/lib/TSVFilter/setup.py

diff --git a/tsv/lib/TSVFilter/TSVFilter/__init__.py b/tsv/lib/TSVFilter/TSVFilter/__init__.py
new file mode 100644
index 0000000..497d38d
--- /dev/null
+++ b/tsv/lib/TSVFilter/TSVFilter/__init__.py
@@ -0,0 +1,8 @@
+"""Command line entry point of the TSVFilter package."""
+
+from TSVFilter.filter import TSVFilter
+
+
+def main():
+    """Run the tool: constructing TSVFilter parses argv and filters the file."""
+    TSVFilter()
diff --git a/tsv/lib/TSVFilter/TSVFilter/filter.py b/tsv/lib/TSVFilter/TSVFilter/filter.py
new file mode 100644
index 0000000..4b281ba
--- /dev/null
+++ b/tsv/lib/TSVFilter/TSVFilter/filter.py
@@ -0,0 +1,248 @@
+"""Filter rows and columns of a delimiter separated file with a header."""
+
+import csv
+import operator
+import os
+import re
+import sys
+from argparse import ArgumentParser
+
+# A filter has the form "<column><operator><value>". The column group is
+# non-greedy so that the FIRST operator splits the expression; operator
+# characters inside the value (e.g. in a regular expression) stay there.
+_NUM_FILTER_RE = re.compile(r"(.+?)(<=|>=|<|>|!=|==)(.+)")
+_STR_FILTER_RE = re.compile(r"(.+?)(!=|==|~=)(.+)")
+
+
+class TSVFilter:
+    """Command line tool that filters a delimited file with a header row.
+
+    Creating an instance runs the whole tool: the command line is parsed,
+    the column list and the filters are prepared and the matching rows are
+    written to stdout.
+    """
+
+    def __init__(self):
+
+        self.get_options()
+        self.parse_columns()
+        self.parse_filters()
+        self.process()
+
+    def get_options(self):
+        """Parse the command line into self.options.
+
+        The parser is kept in self.options_parser so that parse_filters()
+        can print the usage text when a filter is malformed.
+        """
+
+        parser = ArgumentParser()
+        parser.add_argument(
+            "-f",
+            action="store",
+            dest="num_filters",
+            default=None,
+            help="Comma separated list of floating point filters as: 'key[operator]value'. Valid operators are <, <=, >, >=, != and ==. Example: -f 'column1<4,column1>=0,column2==10'",
+        )
+        parser.add_argument(
+            "-s",
+            action="store",
+            dest="str_filters",
+            default=None,
+            help="Comma separated list of string filters as: 'key[operator]value'. Valid operators are !=, == and ~=. The ~= is a regex fullmatch operator. Example: -s 'column1==value,column2!=othervalue,column3~=M[0-9]+'",
+        )
+        parser.add_argument(
+            "-c",
+            action="store",
+            dest="columns",
+            default=None,
+            help="Comma separated list of column names to output. If empty, all columns are included.",
+        )
+        parser.add_argument(
+            "-d",
+            action="store",
+            dest="delimiter",
+            default="\t",
+            help="Delimiter: defaults to tab.",
+        )
+        parser.add_argument(
+            "file",
+            action="store",
+            help="Filename to process. If '-', stdin used.",
+        )
+        self.options_parser = parser
+        self.options = parser.parse_args()
+
+    def parse_columns(self):
+        """Turn the -c option into self.columns (a list, or None for all)."""
+
+        if self.options.columns:
+            self.columns = [c.strip() for c in self.options.columns.split(",")]
+        else:
+            self.columns = None
+
+    def parse_filters(self):
+        """Compile the -f and -s options into self.filters.
+
+        self.filters stays None when no filter was given. Otherwise it is a
+        list of dicts with the keys "col" (column name), "op" (callable that
+        takes the cell value and the filter value), "value" and "numeric".
+        A filter that cannot be parsed prints the usage text and exits with
+        status 1.
+        """
+
+        def reg(value1, expression):
+            return bool(expression.fullmatch(value1))
+
+        foperators = {
+            "<": operator.lt,
+            "<=": operator.le,
+            ">=": operator.ge,
+            ">": operator.gt,
+            "==": operator.eq,
+            "!=": operator.ne,
+        }
+        soperators = {
+            "==": operator.eq,
+            "!=": operator.ne,
+            "~=": reg,
+        }
+
+        self.filters = None
+        if not (self.options.num_filters or self.options.str_filters):
+            return
+        self.filters = []
+
+        try:
+            if self.options.num_filters:
+                for f in self.options.num_filters.split(","):
+                    col, op, value = _split_filter(_NUM_FILTER_RE, f)
+                    self.filters.append(
+                        {
+                            "col": col,
+                            "op": foperators[op],
+                            "value": try_num(value, force_num=True),
+                            "numeric": True,
+                        }
+                    )
+
+            if self.options.str_filters:
+                for f in self.options.str_filters.split(","):
+                    col, op, value = _split_filter(_STR_FILTER_RE, f)
+                    if op == "~=":
+                        # Compile once, up front: an invalid expression is
+                        # then reported as a bad filter instead of failing
+                        # on the first data row.
+                        value = re.compile(value)
+                    self.filters.append(
+                        {
+                            "col": col,
+                            "op": soperators[op],
+                            "value": value,
+                            "numeric": False,
+                        }
+                    )
+        except (ValueError, re.error):
+            self.options_parser.print_help()
+            sys.stderr.write("\nCannot parse filter: {}\n".format(f))
+            sys.exit(1)
+
+    def process(self):
+        """Filter the input named on the command line to stdout."""
+
+        if self.options.file == "-":
+            self._filter_stream(sys.stdin)
+            return
+        # newline="" is what the csv module asks for; the with block closes
+        # the file even when filtering raises.
+        with open(self.options.file, "rt", newline="") as fp:
+            self._filter_stream(fp)
+
+    def _filter_stream(self, fp):
+        """Copy the rows of *fp* that pass every filter to stdout.
+
+        Raises ValueError when an output column or a filter column is not
+        in the header row, or when a numerically filtered cell is not a
+        number.
+        """
+
+        reader = csv.DictReader(fp, delimiter=self.options.delimiter)
+        # fieldnames is None for empty input; treat that as "no columns".
+        available = reader.fieldnames or []
+        fieldnames = self.columns or available
+        wanted = list(self.columns or [])
+        wanted.extend(filt["col"] for filt in self.filters or [])
+        for c in wanted:
+            if c not in available:
+                raise ValueError("No such column '{}'".format(c))
+        if not available:
+            # No header row, hence nothing to filter or to print.
+            return
+
+        writer = csv.DictWriter(
+            sys.stdout,
+            fieldnames=fieldnames,
+            quoting=csv.QUOTE_NONNUMERIC,
+            delimiter=self.options.delimiter,
+        )
+        try:
+            writer.writeheader()
+            for row in reader:
+                if self._row_matches(row):
+                    writer.writerow(
+                        {col: _output_value(row[col]) for col in fieldnames}
+                    )
+        except BrokenPipeError:
+            # The reader of our output closed the pipe; stop quietly.
+            return
+
+    def _row_matches(self, row):
+        """Return True when *row* passes every filter (or there are none)."""
+
+        results = []
+        for filt in self.filters or []:
+            cell = row[filt["col"]]
+            if filt["numeric"]:
+                cell = try_num(cell, force_num=True)
+            elif cell is None:
+                # DictReader pads short rows with None; compare as "".
+                cell = ""
+            results.append(filt["op"](cell, filt["value"]))
+        # Every filter is evaluated (no short circuit) so that a non-numeric
+        # cell in a numerically filtered column is always reported.
+        return all(results)
+
+
+def _split_filter(pattern, expression):
+    """Split a "column<op>value" filter into (column, operator, value).
+
+    Raises ValueError when *expression* does not have that shape.
+    """
+    match = pattern.match(expression.strip())
+    if match is None:
+        raise ValueError("Malformed filter '{}'".format(expression))
+    col, op, value = match.groups()
+    return col.strip(), op, value
+
+
+def _output_value(cell):
+    """Return *cell* as written to the output: a number when it parses."""
+    # A short row has None here; write an empty field, not the text "None".
+    return "" if cell is None else try_num(cell)
+
+
+def try_num(s, force_num=False):
+    """Convert *s* to int or, failing that, to float.
+
+    When neither works, *s* is returned unchanged unless *force_num* is
+    true, in which case ValueError is raised.
+    """
+    for convert in (int, float):
+        try:
+            return convert(s)
+        except (TypeError, ValueError):
+            # TypeError covers None (a cell missing from a short row).
+            pass
+    if force_num:
+        raise ValueError("Value '{}' can not be converted to numeric".format(s))
+    return s
diff --git a/tsv/lib/TSVFilter/setup.py b/tsv/lib/TSVFilter/setup.py
new file mode 100644
index 0000000..bcf9034
--- /dev/null
+++ b/tsv/lib/TSVFilter/setup.py
@@ -0,0 +1,19 @@
+# NOTE(review): "entry_points" is a setuptools option that plain distutils
+# does not implement, and distutils is removed in Python 3.12 (PEP 632).
+# Consider "from setuptools import setup" -- TODO confirm the packaging
+# environment before switching.
+from distutils.core import setup
+setup(
+    name = 'TSVFilter',
+    packages = ['TSVFilter'],
+    version = '1.0',
+    description = 'TSV column filter.',
+    author = 'Ville Rantanen',
+    author_email = 'ville.q.rantanen@gmail.com',
+    keywords = ['TSV', 'data'],
+    entry_points = {
+        "console_scripts": [
+            "TSVFilter=TSVFilter:main",
+        ],
+    },
+)