#!/usr/bin/env python3
# -*- coding: latin-1 -*-
"""Maintain an sqlite database of files (path, mtime, md5 hash, size, mime).

Supports adding/updating/deleting entries, checksum verification, duplicate
detection, disk-usage summaries, fuzzy filename matching and change probes.
Run with --help for the full option list.
"""
import sys
import os
import re
import sqlite3
import subprocess
import hashlib
import magic
from argparse import ArgumentParser
import configparser
import io
import datetime

SQLFILE = 'list_of_files.sqlite'
# Raw string: '\.' in a plain literal is a SyntaxWarning on modern Python.
IMGMATCH = re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$', re.I)
BADDIRS = []          # folder names to skip, filled from -x options
MINSIZE = 0
MIME = magic.open(magic.MAGIC_NONE)
# MIME=magic.open(magic.MAGIC_MIME)
MIME.load()
ANIM = ['.', '·', "'", "'", '·', '.', '_']   # spinner frames for long hashing
DEFAULT_CHUNK = 1024 * 1024 * 50             # hash only the first 50Mb by default


def setup_options():
    """Parse the command line; returns the options namespace.

    Side effects: extends the module-level BADDIRS with -x values and
    attaches ``sqlpath`` (directory of the database file) to the result.
    """
    parser = ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a", action="store_false", dest="add", default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c", action="store_true", dest="changed", default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("--check", action="store_true", dest="check", default=False,
                        help="Check md5sums of files. Limit check with -s.")
    parser.add_argument("-d", action="store_true", dest="delete", default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("--du", type=str, action='store', dest="diskused", default=False,
                        help="Print directory sizes. Argument is the path where directories are listed from.")
    parser.add_argument("--du-depth", type=str, action='store', dest="diskused_depth", default=1,
                        help="Depth of summarization for --du.")
    parser.add_argument("--dup", action="store_true", dest="duplicate", default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--dup-order", action="store", dest="duplicate_order", default='path',
                        help="Order duplicates by a method. (length = path str length)",
                        choices=('age', 'length', 'file', 'path'))
    parser.add_argument("--haschanges", action="store_true", dest="haschanges", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasdeletions", action="store_true", dest="hasdeletions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasadditions", action="store_true", dest="hasadditions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("-f", action="store", dest="sqlfile", default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-l", action="store_true", dest="symlinks", default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("--match", type=str, dest="match", default=False,
                        help="Search for closest match from basenames, can be helped with adding -s")
    parser.add_argument("-s", type=str, action='append', dest="search", default=[],
                        help="Search list based on path pattern")
    parser.add_argument("-x", action="append", dest="exclude", default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument("--full", action="store_true", dest="fullfile", default=False,
                        help="ONLY FOR NEW DB CREATION. Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]")
    parser.add_argument("--relative", action="store_true", dest="relative", default=False,
                        help="ONLY FOR NEW DB CREATION. Store filenames relative to database file.")
    parser.add_argument('startpath', action="store", default='.', nargs='?')
    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        # --dup implies "don't add" unless -a already flipped it.
        options.add = not options.add
    options.sqlpath = os.path.dirname(os.path.realpath(options.sqlfile))
    return options


def add_recurse(options):
    """Walk options.startpath and add new (and, with -c, changed) files."""
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    prev_path_len = 0
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        # Progress line; trailing spaces erase the previous (longer) path.
        sys.stdout.write("\r%s%s" % (filename_join(path, ".", options),
                                     (prev_path_len - len(path)) * ' '))
        prev_path_len = len(path)
        dirs = clean_dirs(dirs)   # in-place prune, so os.walk skips them
        dirs.sort()
        files.sort()
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for file in files:
            filename = filename_join(path, file, options)
            if file == options.sqlfile:
                continue
            if not os.path.isfile(filename):
                continue
            # if not is_listed(db,filename):
            if file not in db_files:
                if options.add:
                    add_single(conn, filename, change=False, fullfile=options.fullfile)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        add_single(conn, filename, change=True, fullfile=options.fullfile)
    conn.commit()
    sys.stdout.write("\n")
    return


def add_single(conn, filename, change=False, hash=None, minsize=0, fullfile=False):
    """Insert (or with change=True, update) one file's row in the DB.

    hash may be supplied to skip recomputation; otherwise it is computed
    with get_md5(). Commits are left to the caller.
    """
    try:
        fsize = os.path.getsize(filename)
        hsize = humanize_size(fsize)
    except IOError:
        fsize = None   # was unbound here before; avoid a latent NameError
        hsize = ""
    print("\r%s (%s)" % (filename, hsize))
    db = conn.cursor()
    try:
        if hash is None:
            hash = get_md5(filename, fullfile)
        ftime = os.path.getmtime(filename)
        # NOTE(review): str(bytes) yields "b'...'" on Python 3 — this is kept
        # for compatibility with existing DB rows; verify against libmagic API.
        mime = MIME.file(str(filename.encode('UTF-8')))
    except IOError:
        print("File '%s' not found. Bad link?" % (filename,))
        return
    except (UnicodeDecodeError, TypeError):
        mime = "NA"
    if change:
        db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? WHERE file=?",
                   (ftime, hash, fsize, mime, filename))
        # print "changing: %(f)s " % {'f':filename}
    else:
        db.execute("INSERT INTO list(file,date,hash,size,mime) VALUES(?,?,?,?,?)",
                   (filename, ftime, hash, fsize, mime))
    sys.stdout.write('\r')
    return


def checkdb(options):
    """Verify stored md5 sums against disk; print a summary report."""
    needle = options.search
    if len(needle) == 0:
        needle.append('%')   # no -s given: match everything
    needle = ['%' + i + '%' for i in needle]
    like_query = ' OR '.join(['file LIKE ?' for i in needle])
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file,hash,size,date FROM list WHERE " + like_query +
               " ORDER BY file", needle)
    missing = []
    differing = []
    OK_count = 0
    for row in db:
        status = 'OK'
        sys.stdout.write("\r%s" % (row[0],))
        if os.path.exists(row[0]):
            md5f = get_md5(row[0], options.fullfile)
            if row[1] != md5f:
                status = 'Checksum-difference'
                differing.append(row)
        else:
            status = 'Not-found'
            missing.append(row)
        sys.stdout.write("\r%s %s\n" % (row[0], status))
        if status == 'OK':
            OK_count += 1
    if len(differing) > 0:
        print_stderr("----\nDiffering files:")
        pad = str(max([len(x[0]) for x in differing]))
        for f in differing:
            print(("%-" + pad + "s (%s %7s => %s %7s)") %
                  (f[0], humanize_date(f[3]), humanize_size(f[2]),
                   humanize_date(os.path.getmtime(f[0])),
                   humanize_size(os.path.getsize(f[0]))))
    if len(missing) > 0:
        print("----\nMissing files:")
        pad = str(max([len(x[0]) for x in missing]))
        for f in missing:
            print(("%-" + pad + "s (%s %7s)") %
                  (f[0], humanize_date(f[3]), humanize_size(f[2])))
    (added, changed) = has_changes_additions(db, options, False)
    if len(added) > 0:
        print("----\nAdded files:")
        pad = str(max([len(x[0]) for x in added]))
        for f in added:
            print(("%-" + pad + "s (%s %7s)") %
                  (f, humanize_date(os.path.getmtime(f)),
                   humanize_size(os.path.getsize(f))))
    print("----\nFile check summary:")
    print("Database modified: %s" % (humanize_date(os.path.getmtime(options.sqlfile)),))
    print("Checksum matches : %d" % (OK_count,))
    print("Checksum mismatch: %d" % (len(differing),))
    print("Files missing    : %d" % (len(missing),))
    print("Files added      : %d" % (len(added),))


def clean_dirs(dirs):
    """Remove excluded (-x) and dot-directories IN PLACE; returns the list."""
    for s in dirs[:]:                     # iterate a copy while mutating
        if (s in BADDIRS) or (s.startswith(".")):
            dirs.remove(s)
    return dirs


def clean_syms(files, path):
    """Return only the names in files that are not symlinks under path."""
    nonsyms = []
    for f in files:
        if not os.path.islink(os.path.join(path, f)):
            nonsyms.append(f)
    return nonsyms


def createdb(options):
    """Create a fresh database file with the list and config tables."""
    conn = sqlite3.connect(options.sqlfile)
    db = conn.cursor()
    conn.text_factory = str
    db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,'
               ' file TEXT,date INTEGER, hash TEXT,'
               ' size INTEGER, mime TEXT)')
    db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,'
               ' object TEXT)')
    conn.commit()
    # Persist the creation-time options so later runs reuse them.
    config = configparser.RawConfigParser()
    config.add_section("General")
    config.set("General", "Relative", str(options.relative))
    config.set("General", "FullFile", str(options.fullfile))
    store = io.StringIO()
    config.write(store)
    db.execute("INSERT INTO config (object) values (?)", (store.getvalue(),))
    conn.commit()
    return


def delete_nonexisting(sqlfile, options):
    """Delete DB rows whose file no longer exists (or is a symlink without -l)."""
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbdel = conn.cursor()   # second cursor: delete while iterating the first
    db.execute('SELECT file FROM list')
    for row in db:
        if os.path.exists(row[0]):
            delete = False
            if not options.symlinks:
                if os.path.islink(row[0]):
                    delete = True
        else:
            delete = True
        if delete:
            print('removing.. ' + row[0])
            dbdel.execute("DELETE FROM list where file == ?", (row[0],))
    conn.commit()
    return


def disk_used(options):
    """Print cumulative sizes per directory under --du, to --du-depth levels."""
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    checkpath = filename_join(options.diskused, "", options) + "/"
    if checkpath == "./":
        checkpath = ""
    db.execute('SELECT size,replace(file,?,"") as path FROM list WHERE file LIKE ?',
               (checkpath, checkpath + "%",))
    entries = []
    sizes = []
    for row in db:
        start_path = row[1].split('/')
        start_path = "/".join(start_path[0:int(options.diskused_depth)])
        if start_path not in entries:
            entries.append(start_path)
            sizes.append(row[0])
        else:
            sizes[entries.index(start_path)] += row[0]
    for entry in zip(sizes, entries):
        print("| ".join([str(entry[0]).ljust(14),
                         humanize_size(entry[0]).rjust(8),
                         entry[1]]))


def filename_join(path, name, options):
    """Join and canonicalize a path; relative to the DB dir if configured."""
    filename = os.path.realpath(os.path.join(path, name))
    if options.relative:
        return os.path.relpath(filename, options.sqlpath)
    return filename


def find_duplicates(sqlfile, order):
    """Return [(hash, [(file, size, date), ...]), ...] for duplicated hashes.

    Empty files (size 0) are excluded; each group is sorted by `order`
    (see sort_by_method) and groups are sorted by their first file tuple.
    """
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbh = conn.cursor()
    db.execute("SELECT hash,count(*) FROM list WHERE size > 0 GROUP BY hash HAVING count(*) > 1 ")
    duphash = []
    for row in db:
        hash = row[0]
        dbh.execute("SELECT file,size,date FROM list WHERE hash = ?", (hash,))
        flist = []
        for row in dbh:
            flist.append(row)
        sort_by_method(flist, order)
        duphash.append((hash, flist))
    duphash.sort(key=lambda file: file[1][0])
    return duphash


def ftime_match(db, filename, ftime):
    """True if the stored mtime for filename equals ftime.

    Assumes the file is already listed (callers check first); raises
    IndexError otherwise.
    """
    db.execute("SELECT date FROM list where file == ?", (filename,))
    count = db.fetchall()
    return count[0][0] == ftime


def get_folder_contents(db, path):
    """Return the basenames stored in the DB directly under path ('dir/')."""
    files = []
    if path == "./":
        # Relative DB root: entries with no '/' at all.
        db.execute("SELECT file FROM list where file NOT LIKE ?", ('%/%',))
        path = ""
    else:
        db.execute("SELECT file FROM list where file LIKE ?", (path + '%',))
    for row in db:
        try:
            base = row[0].replace(path, '', 1)
        except UnicodeDecodeError:
            # Legacy py2-era fallback for undecodable rows; kept defensively.
            print(row[0] + " is giving me trouble.")
            try:
                base = row[0].encode('utf-8').replace(path, '', 1)
            except UnicodeDecodeError:
                print(row[0] + " is still giving me trouble.")
                sys.exit(1)
        if base.find('/') == -1:     # direct child, not in a subfolder
            files.append(base)
    return files


def get_md5(filename, fullfile=False):
    """Return the content md5 hex digest.

    Only the first 50Mb is read unless fullfile is True, in which case the
    whole file is streamed in 16Mb blocks with a progress spinner on stderr.
    """
    fsize = os.path.getsize(filename)
    if fullfile and fsize > DEFAULT_CHUNK:
        anim_i = 0
        anim_len = len(ANIM)
        block_size = 2 ** 24
        percents_per_block = int(100 / (float(fsize) / block_size))
        md5 = hashlib.md5()
        with open(filename, 'rb') as f:
            for chunk in iter(lambda: f.read(block_size), b''):
                sys.stderr.write('\r %s (%02d%%)' %
                                 (ANIM[anim_i % anim_len],
                                  int(anim_i * percents_per_block)))
                sys.stderr.flush()
                anim_i += 1
                md5.update(chunk)
        sys.stderr.write('\r ')
        return md5.hexdigest()
    # Use a context manager so the handle is closed (was leaked before).
    with open(filename, 'rb') as f:
        return hashlib.md5(f.read(DEFAULT_CHUNK)).hexdigest()


def has_changes(options):
    """Probe-only mode: print 'True' and exit(1) if the DB needs an update."""
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if options.haschanges:
        options.changed = True   # --haschanges also compares mtimes
    if options.hasdeletions or options.haschanges:
        has_changes_deleted(db)
    if options.hasadditions or options.haschanges:
        has_changes_additions(db, options)


def has_changes_deleted(db, exit=True):
    """Detect listed files missing from disk.

    With exit=True prints 'True' and exits 1 on first hit; otherwise
    returns the list of deleted paths.
    """
    db.execute('SELECT file FROM list')
    deleted = []
    for row in db:
        if not os.path.exists(row[0]):
            if exit:
                print('True')
                sys.exit(1)
            else:
                deleted.append(row[0])
    return deleted


def has_changes_additions(db, options, exit=True):
    """Detect on-disk files that are new (or changed, with -c) vs the DB.

    With exit=True prints 'True' and exits 1 on first hit; otherwise
    returns (added, changed) path lists.
    """
    added = []
    changed = []
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        dirs = clean_dirs(dirs)
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for file in files:
            filename = filename_join(path, file, options)
            if file == options.sqlfile:
                continue
            # if not is_listed(db,filename):
            if file not in db_files:
                if exit:
                    print('True')
                    sys.exit(1)
                else:
                    added.append(filename)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        if exit:
                            print('True')
                            sys.exit(1)
                        else:
                            changed.append(filename)
    return (added, changed)


# ~ def hash_match(db,filename,hash):
# ~     db.execute("SELECT hash FROM list where file == ?",(filename,))
# ~     count=db.fetchall()
# ~     return count[0][0]==hash


def humanize_date(date):
    """Format a unix timestamp as 'YYYY-MM-DD HH:MM:SS'; '' for None."""
    if date is None:
        return ''
    return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S')


def humanize_size(size, precision=1):
    """Format a byte count with a B/KB/MB/GB/TB suffix; 'nan' for None."""
    if size is None:
        return 'nan'
    suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
    suffixIndex = 0
    defPrecision = 0
    # Bound the loop so sizes beyond TB no longer IndexError.
    while size > 1024 and suffixIndex < len(suffixes) - 1:
        suffixIndex += 1            # increment the index of the suffix
        size = float(size / 1024.0)  # apply the division
        defPrecision = precision
    return "%.*f%s" % (defPrecision, size, suffixes[suffixIndex])


def is_listed(db, filename):
    """True if filename already has a row in the list table."""
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    count = db.fetchall()
    return count[0][0] > 0


def matchdb(sqlfile, needle, helper):
    """Print the stored path whose basename best fuzzy-matches needle.

    helper patterns (-s) pre-filter the candidate rows via LIKE.
    """
    needle = needle.lower()
    import difflib as dl
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if len(helper) > 0:
        helper = ['%' + i + '%' for i in helper]
        like_query = ' OR '.join(['file LIKE ?' for i in helper])
        db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY date DESC", helper)
    else:
        db.execute("SELECT file FROM list ORDER BY date DESC")
    ratio = 0
    best_match = ""
    for row in db:
        s = dl.SequenceMatcher(None, os.path.basename(row[0]).lower(), needle)
        s_ratio = s.ratio()
        if ratio < s_ratio:
            ratio = s_ratio
            best_match = row[0]
    print(best_match)


def print_duplicates(files):
    """Print find_duplicates() output as 'index|size|date|file' lines."""
    for hash in files:
        # print(hash[0])
        i = 1
        for f in hash[1]:
            print("%(i)d|%(s)s|%(d)s|%(f)s " %
                  {'i': i, 'f': f[0], 'd': humanize_date(f[2]),
                   's': humanize_size(f[1])})
            i += 1
    return


def print_stderr(s):
    """Write s plus a newline to stderr and flush."""
    sys.stderr.write(s)
    sys.stderr.write("\n")
    sys.stderr.flush()


def searchdb(sqlfile, needle):
    """Print stored paths matching any of the -s patterns (LIKE %pat%)."""
    needle = ['%' + i + '%' for i in needle]
    like_query = ' OR '.join(['file LIKE ?' for i in needle])
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY file", needle)
    for row in db:
        print(row[0])


def sort_by_method(flist, order):
    """Sort a list of (file, size, date) tuples in place by the chosen key."""
    if order == 'path':
        flist.sort(key=lambda file: file[0])
    if order == 'file':
        flist.sort(key=lambda file: os.path.basename(file[0]))
    if order == 'age':
        flist.sort(key=lambda file: file[2])
    if order == 'length':
        flist.sort(key=lambda file: len(file[0]))


def stored_options(options):
    """Overlay DB-creation-time options (Relative, FullFile) onto options.

    Best-effort: any DB/parse problem leaves the command-line values as-is.
    """
    try:
        conn = sqlite3.connect(options.sqlfile)
        db = conn.cursor()
        conn.text_factory = str
        db.execute("SELECT object FROM config")
        store = ""
        for row in db:
            store += row[0] + '\n'
        config = configparser.RawConfigParser()
        # BUG FIX: the old readfp(io.BytesIO(store)) passed str to BytesIO,
        # always raising TypeError which the bare except swallowed — stored
        # options were never actually loaded. read_string() works and
        # readfp() is removed in Python 3.12+.
        config.read_string(store)
        options.relative = config.getboolean("General", "Relative")
        options.fullfile = config.getboolean("General", "FullFile")
    except (sqlite3.Error, configparser.Error, ValueError):
        pass
    return options


def main():
    """Entry point: dispatch on the parsed command-line options."""
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options)
    options = stored_options(options)
    if options.relative:
        # Relative DBs store paths relative to the DB directory.
        os.chdir(options.sqlpath)
    if options.haschanges or options.hasadditions or options.hasdeletions:
        has_changes(options)
        sys.exit(0)
    if options.check:
        checkdb(options)
        sys.exit(0)
    if len(options.search) > 0 and not options.match:
        searchdb(options.sqlfile, options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile, options.match, options.search)
        sys.exit(0)
    if options.diskused:
        disk_used(options)
        sys.exit(0)
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print('Adding ' + options.startpath + ' entries...')
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile, options.duplicate_order)
        print_duplicates(files)
    sys.exit(0)


if __name__ == "__main__":
    main()