#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Maintain an sqlite catalogue of files (path, mtime, md5 hash, size).

Walks a directory tree and keeps the database in sync: add new files,
update changed ones, delete vanished entries, verify checksums, list
duplicates by content hash, fuzzy-match filenames and print du-style
directory size summaries.
"""
# py2/py3 compatibility shims; no-ops on Python 3 where `builtins` is stdlib.
from builtins import zip
from builtins import str
import sys
import os
import re
import sqlite3
import subprocess
import hashlib

# import magic
from argparse import ArgumentParser
import configparser
import io
import datetime

SQLFILE = "list_of_files.sqlite"
# Case-insensitive image-extension matcher.  Raw string: the original used
# a plain string whose "\." is an invalid escape sequence in modern Python.
IMGMATCH = re.compile(r".*\.jpg$|.*\.jpeg$|.*\.png$", re.I)
BADDIRS = []  # directory names pruned from the walk (-x option appends here)
MINSIZE = 0
# MIME=magic.open(magic.MAGIC_NONE)
##MIME=magic.open(magic.MAGIC_MIME)
# MIME.load()
ANIM = [".", "·", "'", "'", "·", ".", "_"]  # spinner frames for hashing progress
DEFAULT_CHUNK = 1024 * 1024 * 50  # hash only the first 50 MB unless --full


def setup_options():
    """Parse command-line arguments and derive a few extra option fields.

    Side effects: extends module-level BADDIRS with -x exclusions and, for
    --dup, flips the "add new files" flag.  Returns the options namespace
    with `sqlpath` (directory of the database file) attached.
    """
    parser = ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument(
        "-a",
        action="store_false",
        dest="add",
        default=True,
        help="Do not add new files [%(default)s]",
    )
    parser.add_argument(
        "-c",
        action="store_true",
        dest="changed",
        default=False,
        help="Modify changed files [%(default)s]",
    )
    parser.add_argument(
        "--check",
        action="store_true",
        dest="check",
        default=False,
        help="Check md5sums of files. Limit check with -s.",
    )
    parser.add_argument(
        "-d",
        action="store_true",
        dest="delete",
        default=False,
        help="Delete non-existing entries [%(default)s]",
    )
    parser.add_argument(
        "--du",
        type=str,
        action="store",
        dest="diskused",
        default=False,
        help="Print directory sizes. Argument is the path where directories are listed from.",
    )
    parser.add_argument(
        "--du-depth",
        type=str,
        action="store",
        dest="diskused_depth",
        default=1,
        help="Depth of summarization for --du.",
    )
    parser.add_argument(
        "--dup",
        action="store_true",
        dest="duplicate",
        default=False,
        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]",
    )
    parser.add_argument(
        "--dup-order",
        action="store",
        dest="duplicate_order",
        default="path",
        help="Order duplicates by a method. (length = path str length)",
        choices=("age", "length", "file", "path"),
    )
    parser.add_argument(
        "--haschanges",
        action="store_true",
        dest="haschanges",
        default=False,
        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.",
    )
    parser.add_argument(
        "--hasdeletions",
        action="store_true",
        dest="hasdeletions",
        default=False,
        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.",
    )
    parser.add_argument(
        "--hasadditions",
        action="store_true",
        dest="hasadditions",
        default=False,
        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.",
    )
    parser.add_argument(
        "-f",
        action="store",
        dest="sqlfile",
        default=SQLFILE,
        help="SQL file name to use [%(default)s]",
    )
    parser.add_argument(
        "-l",
        action="store_true",
        dest="symlinks",
        default=False,
        help="Follow symbolic links [%(default)s]",
    )
    parser.add_argument(
        "--match",
        type=str,
        dest="match",
        default=False,
        help="Search for closest match from basenames, can be helped with adding -s",
    )
    parser.add_argument(
        "-s",
        type=str,
        action="append",
        dest="search",
        default=[],
        help="Search list based on path pattern",
    )
    parser.add_argument(
        "-x",
        action="append",
        dest="exclude",
        default=[],
        help="Exclude folder name from the lists. This option may be issued several times",
    )
    parser.add_argument(
        "--full",
        action="store_true",
        dest="fullfile",
        default=False,
        help="ONLY FOR NEW DB CREATION. Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]",
    )
    parser.add_argument(
        "--relative",
        action="store_true",
        dest="relative",
        default=False,
        help="ONLY FOR NEW DB CREATION. Store filenames relative to database file.",
    )
    parser.add_argument("startpath", action="store", default=".", nargs="?")
    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        # Duplicate listing flips the "add" default so a bare --dup only reads.
        options.add = not options.add
    options.sqlpath = os.path.dirname(os.path.realpath(options.sqlfile))
    return options


def add_recurse(options):
    """Walk options.startpath and add/update database entries.

    New files are inserted when options.add is set; existing entries are
    refreshed when options.changed is set and the mtime differs.
    """
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    prev_path_len = 0
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        # Progress line; pad with spaces to erase a longer previous path.
        sys.stdout.write(
            (
                "\r%s%s"
                % (filename_join(path, ".", options), (prev_path_len - len(path)) * " ")
            )
        )
        prev_path_len = len(path)
        dirs = clean_dirs(dirs)  # mutates in place so os.walk prunes the tree
        dirs.sort()
        files.sort()
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for name in files:
            filename = filename_join(path, name, options)
            # NOTE(review): compares the basename against the sqlfile option
            # string; only skips the DB when -f is a bare filename — confirm.
            if name == options.sqlfile:
                continue
            if not os.path.isfile(filename):
                continue
            # if not is_listed(db,filename):
            if name not in db_files:
                if options.add:
                    add_single(conn, filename, change=False, fullfile=options.fullfile)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        add_single(
                            conn, filename, change=True, fullfile=options.fullfile
                        )
    conn.commit()
    sys.stdout.write("\n")
    return


def add_single(conn, filename, change=False, hash=None, minsize=0, fullfile=False):
    """Insert (or, with change=True, update) a single file row.

    hash may be supplied to skip re-hashing; otherwise the file is hashed
    (first 50 MB unless fullfile).  Commit is left to the caller.
    """
    try:
        fsize = os.path.getsize(filename)
        hsize = humanize_size(fsize)
    except IOError:
        hsize = ""
    print("\r%s (%s)" % (filename, hsize))
    db = conn.cursor()
    try:
        if hash is None:
            hash = get_md5(filename, fullfile)
        ftime = os.path.getmtime(filename)
    except IOError:
        print("File '%s' not found. Bad link?" % (filename,))
        return
    if change:
        db.execute(
            "UPDATE list SET date=?, hash=?, size=? WHERE file=?",
            (ftime, hash, fsize, filename),
        )
    else:
        db.execute(
            "INSERT INTO list(file,date,hash,size) VALUES(?,?,?,?)",
            (filename, ftime, hash, fsize),
        )
    sys.stdout.write("\r")
    return


def checkdb(options):
    """Verify stored checksums against on-disk content and print a summary.

    The scan can be limited with -s patterns.  Reports differing, missing
    and (not yet added) new files, then summary counters.
    """
    needle = options.search
    if len(needle) == 0:
        needle.append("%")
    needle = ["%" + i + "%" for i in needle]
    like_query = " OR ".join(["file LIKE ?" for i in needle])
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute(
        "SELECT file,hash,size,date FROM list WHERE " + like_query + " ORDER BY file",
        needle,
    )
    missing = []
    differing = []
    OK_count = 0
    for row in db:
        status = "OK"
        sys.stdout.write("\r%s" % (row[0],))
        if os.path.exists(row[0]):
            md5f = get_md5(row[0], options.fullfile)
            if row[1] != md5f:
                status = "Checksum-difference"
                differing.append(row)
        else:
            status = "Not-found"
            missing.append(row)
        sys.stdout.write("\r%s %s\n" % (row[0], status))
        if status == "OK":
            OK_count += 1
    if len(differing) > 0:
        print_stderr("----\nDiffering files:")
        pad = str(max([len(x[0]) for x in differing]))
        for f in differing:
            print(
                ("%-" + pad + "s (%s %7s => %s %7s)")
                % (
                    f[0],
                    humanize_date(f[3]),
                    humanize_size(f[2]),
                    humanize_date(os.path.getmtime(f[0])),
                    humanize_size(os.path.getsize(f[0])),
                )
            )
    if len(missing) > 0:
        print("----\nMissing files:")
        pad = str(max([len(x[0]) for x in missing]))
        for f in missing:
            print(
                ("%-" + pad + "s (%s %7s)")
                % (f[0], humanize_date(f[3]), humanize_size(f[2]))
            )
    (added, changed) = has_changes_additions(db, options, False)
    if len(added) > 0:
        print("----\nAdded files:")
        pad = str(max([len(x[0]) for x in added]))
        for f in added:
            print(
                ("%-" + pad + "s (%s %7s)")
                % (
                    f,
                    humanize_date(os.path.getmtime(f)),
                    humanize_size(os.path.getsize(f)),
                )
            )
    print("----\nFile check summary:")
    print("Database modified: %s" % (humanize_date(os.path.getmtime(options.sqlfile)),))
    print("Checksum matches : %d" % (OK_count,))
    print("Checksum mismatch: %d" % (len(differing),))
    print("Files missing    : %d" % (len(missing),))
    print("Files added      : %d" % (len(added),))


def clean_dirs(dirs):
    """Remove excluded (BADDIRS) and hidden directories from *dirs* in place.

    In-place mutation matters: os.walk prunes recursion through this list.
    Returns the same list for convenience.
    """
    for s in dirs[:]:  # iterate a copy; we mutate the original
        if (s in BADDIRS) or (s.startswith(".")):
            dirs.remove(s)
    return dirs


def clean_syms(files, path):
    """Return *files* with symbolic links (relative to *path*) filtered out."""
    nonsyms = []
    for f in files:
        if not os.path.islink(os.path.join(path, f)):
            nonsyms.append(f)
    return nonsyms


def createdb(options):
    """Create a fresh database with `list` and `config` tables.

    The creation-time options (Relative, FullFile) are serialized through
    configparser into the config table so later runs reuse them.
    """
    conn = sqlite3.connect(options.sqlfile)
    db = conn.cursor()
    conn.text_factory = str
    db.execute(
        "CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,"
        "file TEXT,date INTEGER, hash TEXT,"
        "size INTEGER, mime TEXT)"
    )
    db.execute(
        "CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,"
        "object TEXT)"
    )
    conn.commit()
    config = configparser.RawConfigParser()
    config.add_section("General")
    config.set("General", "Relative", str(options.relative))
    config.set("General", "FullFile", str(options.fullfile))
    store = io.StringIO()
    config.write(store)
    db.execute("INSERT INTO config (object) values (?)", (store.getvalue(),))
    conn.commit()
    return


def delete_nonexisting(sqlfile, options):
    """Delete rows whose file no longer exists (or is a symlink when links
    are not followed)."""
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbdel = conn.cursor()  # second cursor: delete while iterating the first
    db.execute("SELECT file FROM list")
    for row in db:
        if os.path.exists(row[0]):
            delete = False
            if not options.symlinks:
                if os.path.islink(row[0]):
                    delete = True
        else:
            delete = True
        if delete:
            print("removing.. " + row[0])
            dbdel.execute("DELETE FROM list where file == ?", (row[0],))
    conn.commit()
    return


def disk_used(options):
    """Print per-directory size totals below options.diskused, summarized to
    options.diskused_depth path components (du-style)."""
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    checkpath = filename_join(options.diskused, "", options) + "/"
    if checkpath == "./":
        checkpath = ""
    db.execute(
        'SELECT size,replace(file,?,"") as path FROM list WHERE file LIKE ?',
        (
            checkpath,
            checkpath + "%",
        ),
    )
    entries = []
    sizes = []
    for row in db:
        start_path = row[1].split("/")
        start_path = "/".join(start_path[0 : int(options.diskused_depth)])
        if start_path not in entries:
            entries.append(start_path)
            sizes.append(row[0])
        else:
            sizes[entries.index(start_path)] += row[0]
    for entry in zip(sizes, entries):
        print(
            "| ".join(
                [str(entry[0]).ljust(14), humanize_size(entry[0]).rjust(8), entry[1]]
            )
        )


def filename_join(path, name, options):
    """Join and canonicalize a path; relative to the DB dir when configured."""
    filename = os.path.realpath(os.path.join(path, name))
    if options.relative:
        return os.path.relpath(filename, options.sqlpath)
    return filename


def find_duplicates(sqlfile, order):
    """Return [(hash, [(file, size, date), ...]), ...] for every content hash
    shared by more than one non-empty file, each group sorted by *order*."""
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbh = conn.cursor()
    db.execute(
        "SELECT hash,count(*) FROM list WHERE size > 0 GROUP BY hash HAVING count(*) > 1 "
    )
    duphash = []
    for row in db:
        content_hash = row[0]
        dbh.execute("SELECT file,size,date FROM list WHERE hash = ?", (content_hash,))
        flist = []
        for frow in dbh:
            flist.append(frow)
        sort_by_method(flist, order)
        duphash.append((content_hash, flist))
    # Order the groups by their first entry (file, size, date tuple).
    duphash.sort(key=lambda group: group[1][0])
    return duphash


def ftime_match(db, filename, ftime):
    """Return True if the stored mtime for *filename* equals *ftime*.

    A missing row counts as "no match" so the caller re-adds the file
    (the original indexed fetchall()[0] and crashed on a missing entry).
    """
    db.execute("SELECT date FROM list where file == ?", (filename,))
    row = db.fetchone()
    return row is not None and row[0] == ftime


def get_folder_contents(db, path):
    """Return the basenames of database entries directly inside *path*."""
    files = []
    if path == "./":
        # Relative DB root: entries with no "/" at all.
        db.execute("SELECT file FROM list where file NOT LIKE ?", ("%/%",))
        path = ""
    else:
        db.execute("SELECT file FROM list where file LIKE ?", (path + "%",))
    for row in db:
        # NOTE(review): the UnicodeDecodeError fallback is a Python 2
        # leftover; str.replace cannot raise it on Python 3.
        try:
            base = row[0].replace(path, "", 1)
        except UnicodeDecodeError:
            print(row[0] + " is giving me trouble.")
            try:
                base = row[0].encode("utf-8").replace(path, "", 1)
            except UnicodeDecodeError:
                print(row[0] + " is still giving me trouble.")
                sys.exit(1)
        if base.find("/") == -1:  # keep only direct children, not subfolders
            files.append(base)
    return files


def get_md5(filename, fullfile=False):
    """Return the md5 hex digest of *filename*.

    Only the first 50 MB is read unless fullfile is True, in which case the
    whole file is streamed in 16 MB chunks with a stderr progress spinner.
    """
    fsize = os.path.getsize(filename)
    if fullfile and fsize > DEFAULT_CHUNK:
        anim_i = 0
        anim_len = len(ANIM)
        block_size = 2**24
        percents_per_block = int(100 / (float(fsize) / block_size))
        md5 = hashlib.md5()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                sys.stderr.write(
                    "\r %s (%02d%%)"
                    % (ANIM[anim_i % anim_len], int(anim_i * percents_per_block))
                )
                sys.stderr.flush()
                anim_i += 1
                md5.update(chunk)
        sys.stderr.write("\r         ")
        return md5.hexdigest()
    # `with` closes the handle; the original leaked an open file here.
    with open(filename, "rb") as f:
        return hashlib.md5(f.read(DEFAULT_CHUNK)).hexdigest()


def has_changes(options):
    """Dry-run check driver for --haschanges/--hasdeletions/--hasadditions.

    Delegates to the specific checks, which print "True" and exit(1) on the
    first discrepancy; falling through means the DB is intact.
    """
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if options.haschanges:
        options.changed = True  # --haschanges implies mtime comparison
    if options.hasdeletions or options.haschanges:
        has_changes_deleted(db)
    if options.hasadditions or options.haschanges:
        has_changes_additions(db, options)


def has_changes_deleted(db, exit=True):
    """Find DB entries whose file is gone.

    With exit=True (default) prints "True" and exits 1 at the first hit;
    otherwise returns the list of deleted paths.
    """
    db.execute("SELECT file FROM list")
    deleted = []
    for row in db:
        if not os.path.exists(row[0]):
            if exit:
                print("True")
                sys.exit(1)
            else:
                deleted.append(row[0])
    return deleted


def has_changes_additions(db, options, exit=True):
    """Find on-disk files missing from the DB (and, when options.changed,
    files whose mtime differs).

    With exit=True prints "True" and exits 1 at the first hit; otherwise
    returns (added, changed) path lists.
    """
    added = []
    changed = []
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        dirs = clean_dirs(dirs)  # in-place prune for os.walk
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for name in files:
            filename = filename_join(path, name, options)
            if name == options.sqlfile:
                continue
            # if not is_listed(db,filename):
            if name not in db_files:
                if exit:
                    print("True")
                    sys.exit(1)
                else:
                    added.append(filename)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        if exit:
                            print("True")
                            sys.exit(1)
                        else:
                            changed.append(filename)
    return (added, changed)


# ~ def hash_match(db,filename,hash):
# ~     db.execute("SELECT hash FROM list where file == ?",(filename,))
# ~     count=db.fetchall()
# ~     return count[0][0]==hash


def humanize_date(date):
    """Format a unix timestamp as 'YYYY-mm-dd HH:MM:SS'; '' for None."""
    if date is None:
        return ""
    return datetime.datetime.fromtimestamp(int(date)).strftime("%Y-%m-%d %H:%M:%S")


def humanize_size(size, precision=1):
    """Format a byte count with a binary-style suffix, e.g. 2048 -> '2.0KB'.

    Returns 'nan' for None.  Values below 1 KB are printed with zero
    decimals; anything scaled gets *precision* decimals.
    """
    if size is None:
        return "nan"
    suffixes = ["B", "KB", "MB", "GB", "TB"]
    suffixIndex = 0
    defPrecision = 0
    # Bound the index so petabyte-range sizes print as TB instead of
    # raising IndexError (original looped unbounded).
    while size > 1024 and suffixIndex < len(suffixes) - 1:
        suffixIndex += 1  # increment the index of the suffix
        size = float(size / 1024.0)  # apply the division
        defPrecision = precision
    return "%.*f%s" % (defPrecision, size, suffixes[suffixIndex])


def is_listed(db, filename):
    """Return True if *filename* has a row in the list table."""
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    count = db.fetchall()
    return count[0][0] > 0


def matchdb(sqlfile, needle, helper):
    """Print the DB path whose basename best fuzzy-matches *needle*.

    *helper* patterns (from -s) pre-filter candidate rows with LIKE.
    """
    needle = needle.lower()
    import difflib as dl  # local import: only needed for this mode

    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if len(helper) > 0:
        helper = ["%" + i + "%" for i in helper]
        like_query = " OR ".join(["file LIKE ?" for i in helper])
        db.execute(
            "SELECT file FROM list WHERE " + like_query + " ORDER BY date DESC", helper
        )
    else:
        db.execute("SELECT file FROM list ORDER BY date DESC")
    ratio = 0
    best_match = ""
    for row in db:
        s = dl.SequenceMatcher(None, os.path.basename(row[0]).lower(), needle)
        s_ratio = s.ratio()
        if ratio < s_ratio:
            ratio = s_ratio
            best_match = row[0]
    print(best_match)


def print_duplicates(files):
    """Print duplicate groups as 'index|size|date|file' lines."""
    for group in files:
        # print(group[0])
        for i, f in enumerate(group[1], 1):
            print(
                "%(i)d|%(s)s|%(d)s|%(f)s "
                % {
                    "i": i,
                    "f": f[0],
                    "d": humanize_date(f[2]),
                    "s": humanize_size(f[1]),
                }
            )
    return


def print_stderr(s):
    """Write *s* plus newline to stderr and flush immediately."""
    sys.stderr.write(s)
    sys.stderr.write("\n")
    sys.stderr.flush()


def searchdb(sqlfile, needle):
    """Print every DB path LIKE-matching any pattern in *needle*."""
    needle = ["%" + i + "%" for i in needle]
    like_query = " OR ".join(["file LIKE ?" for i in needle])
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY file", needle)
    for row in db:
        print(row[0])


def sort_by_method(flist, order):
    """Sort (file, size, date) tuples in place by the --dup-order method."""
    if order == "path":
        flist.sort(key=lambda entry: entry[0])
    if order == "file":
        flist.sort(key=lambda entry: os.path.basename(entry[0]))
    if order == "age":
        flist.sort(key=lambda entry: entry[2])
    if order == "length":
        flist.sort(key=lambda entry: len(entry[0]))


def stored_options(options):
    """Overlay creation-time options (Relative, FullFile) from the DB's
    config table onto *options*.

    Best-effort by design: older databases without a config table simply
    keep the command-line values, so failures are deliberately swallowed.
    """
    try:
        conn = sqlite3.connect(options.sqlfile)
        db = conn.cursor()
        conn.text_factory = str
        db.execute("SELECT object FROM config")
        store = ""
        for row in db:
            store += row[0] + "\n"
        config = configparser.RawConfigParser()
        config.read_file(io.StringIO(store))
        options.relative = config.getboolean("General", "Relative")
        options.fullfile = config.getboolean("General", "FullFile")
    except Exception:
        pass  # no/old config table: fall back to command-line defaults
    return options


def main():
    """Dispatch the selected mode(s); most modes exit directly."""
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options)
    options = stored_options(options)
    if options.relative:
        os.chdir(options.sqlpath)
    if options.haschanges or options.hasadditions or options.hasdeletions:
        has_changes(options)
        sys.exit(0)
    if options.check:
        checkdb(options)
        sys.exit(0)
    if len(options.search) > 0 and not options.match:
        searchdb(options.sqlfile, options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile, options.match, options.search)
        sys.exit(0)
    if options.diskused:
        disk_used(options)
        sys.exit(0)
    if options.delete:
        print("Deleting entries...")
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print("Adding " + options.startpath + " entries...")
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile, options.duplicate_order)
        print_duplicates(files)
    sys.exit(0)


if __name__ == "__main__":
    # Guarded so importing the module (e.g. for testing) has no side effects;
    # the original called main() unconditionally at import time.
    main()