#!/usr/bin/env python
# -*- coding: latin-1 -*-
"""Maintain an sqlite database of files with their mtimes, sizes, mime types
and (partial) md5 checksums; supports add/check/delete/duplicate queries.

NOTE: this is Python 2 code (uses `unicode`, ConfigParser, StringIO and the
legacy `magic.open` API).
"""

import sys
import os
import re
import sqlite3
import subprocess
import hashlib
import magic
from argparse import ArgumentParser
import ConfigParser, StringIO, io
import datetime

# Default database file name, created in the current directory.
SQLFILE = 'list_of_files.sqlite'
# Image name matcher (currently unused by the visible code paths).
IMGMATCH = re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$', re.I)
# Directory basenames to skip while walking; filled from -x options.
BADDIRS = []
MINSIZE = 0
# Legacy libmagic handle; MAGIC_NONE yields a textual description.
MIME = magic.open(magic.MAGIC_NONE)
# MIME = magic.open(magic.MAGIC_MIME)
MIME.load()
# Tiny spinner frames for long checksum runs.
ANIM = ['.', '·', "'", "'", '·', '.', '_']
# By default only the first 50 MB of a file is hashed (see --full).
DEFAULT_CHUNK = 1024 * 1024 * 50


def setup_options():
    """Parse the command line and return the options namespace.

    Side effects: extends the module-level BADDIRS with -x values and
    derives options.sqlpath (directory of the database file).
    """
    parser = ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a", action="store_false", dest="add", default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c", action="store_true", dest="changed", default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("--check", action="store_true", dest="check", default=False,
                        help="Check md5sums of files. Limit check with -s.")
    parser.add_argument("-d", action="store_true", dest="delete", default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("--du", type=str, action='store', dest="diskused", default=False,
                        help="Print directory sizes. Argument is the path where directories are listed from.")
    parser.add_argument("--du-depth", type=str, action='store', dest="diskused_depth", default=1,
                        help="Depth of summarization for --du.")
    parser.add_argument("--dup", action="store_true", dest="duplicate", default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--haschanges", action="store_true", dest="haschanges", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasdeletions", action="store_true", dest="hasdeletions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasadditions", action="store_true", dest="hasadditions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("-f", action="store", dest="sqlfile", default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-l", action="store_true", dest="symlinks", default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("--match", type=str, dest="match", default=False,
                        help="Search for closest match from basenames, can be helped with adding -s")
    parser.add_argument("-s", type=str, action='append', dest="search", default=[],
                        help="Search list based on path pattern")
    parser.add_argument("-x", action="append", dest="exclude", default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument("--full", action="store_true", dest="fullfile", default=False,
                        help="ONLY FOR NEW DB CREATION. Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]")
    parser.add_argument("--relative", action="store_true", dest="relative", default=False,
                        help="ONLY FOR NEW DB CREATION. Store filenames relative to database file.")
    parser.add_argument('startpath', action="store", default='.', nargs='?')
    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        # --dup implies flipping the add behaviour (normally: don't add).
        options.add = not options.add
    options.startpath = unicode(options.startpath, "UTF-8")
    options.sqlpath = os.path.dirname(os.path.realpath(options.sqlfile))
    return options


def add_recurse(options):
    """Walk options.startpath and add new (and optionally changed) files."""
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    prev_path_len = 0
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        # Progress line; pad with spaces to erase a longer previous path.
        sys.stdout.write(("\r%s%s" % (filename_join(path, ".", options),
                                      (prev_path_len - len(path)) * ' ')).encode('utf-8'))
        prev_path_len = len(path)
        # clean_dirs mutates `dirs` in place, which is what prunes os.walk.
        dirs = clean_dirs(dirs)
        dirs.sort()
        files.sort()
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for file in files:
            filename = filename_join(path, file, options)
            if file == options.sqlfile:
                continue
            if not os.path.isfile(filename):
                continue
            # if not is_listed(db, filename):
            if file not in db_files:
                if options.add:
                    add_single(conn, filename, change=False, fullfile=options.fullfile)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        add_single(conn, filename, change=True, fullfile=options.fullfile)
    conn.commit()
    sys.stdout.write("\n")
    return


def add_single(conn, filename, change=False, hash=None, minsize=0, fullfile=False):
    """Insert (or, with change=True, update) one file row in the DB.

    The commit is the caller's responsibility.
    """
    try:
        fsize = os.path.getsize(filename)
        hsize = humanize_size(fsize)
    except (IOError, OSError):
        # BUGFIX: fsize must still be bound — it is used in the SQL below.
        # (Also catch OSError: Py2 os.path.getsize raises OSError, not IOError.)
        fsize = None
        hsize = ""
    print("\r%s (%s)" % (filename, hsize))
    db = conn.cursor()
    try:
        if hash is None:
            hash = get_md5(filename, fullfile)
        ftime = os.path.getmtime(filename)
        mime = MIME.file(filename.encode('UTF-8'))
    except IOError:
        print("File '%s' not found. Bad link?" % (filename,))
        return
    except UnicodeDecodeError:
        mime = "NA"
    if change:
        db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
                    WHERE file=?", (ftime, hash, fsize, mime, filename))
        # print "changing: %(f)s " % {'f': filename}
    else:
        db.execute("INSERT INTO list(file,date,hash,size,mime)\
                    VALUES(?,?,?,?,?)", (filename, ftime, hash, fsize, mime))
    sys.stdout.write('\r')
    return


def checkdb(options):
    """Re-hash listed files (optionally filtered by -s) and report
    checksum mismatches, missing files, added files and a summary."""
    needle = options.search
    if len(needle) == 0:
        needle.append('%')
    needle = ['%' + i + '%' for i in needle]
    like_query = ' OR '.join(['file LIKE ?' for i in needle])
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file,hash,size,date FROM list WHERE " + like_query +
               " ORDER BY file", needle)
    missing = []
    differing = []
    OK_count = 0
    for row in db:
        status = 'OK'
        sys.stdout.write("\r%s" % (row[0],))
        if os.path.exists(row[0]):
            md5f = get_md5(row[0], options.fullfile)
            if row[1] != md5f:
                status = 'Checksum-difference'
                differing.append(row)
        else:
            status = 'Not-found'
            missing.append(row)
        sys.stdout.write("\r%s %s\n" % (row[0], status))
        if status == 'OK':
            OK_count += 1
    if len(differing) > 0:
        print_stderr("----\nDiffering files:")
        pad = str(max([len(x[0]) for x in differing]))
        for f in differing:
            print(("%-" + pad + "s (%s %7s => %s %7s)") %
                  (f[0], humanize_date(f[3]), humanize_size(f[2]),
                   humanize_date(os.path.getmtime(f[0])),
                   humanize_size(os.path.getsize(f[0]))))
    if len(missing) > 0:
        print("----\nMissing files:")
        pad = str(max([len(x[0]) for x in missing]))
        for f in missing:
            print(("%-" + pad + "s (%s %7s)") %
                  (f[0], humanize_date(f[3]), humanize_size(f[2])))
    (added, changed) = has_changes_additions(db, options, False)
    if len(added) > 0:
        print("----\nAdded files:")
        pad = str(max([len(x[0]) for x in added]))
        for f in added:
            print(("%-" + pad + "s (%s %7s)") %
                  (f, humanize_date(os.path.getmtime(f)),
                   humanize_size(os.path.getsize(f))))
    print("----\nFile check summary:")
    print("Database modified: %s" % (humanize_date(os.path.getmtime(options.sqlfile)),))
    print("Checksum matches : %d" % (OK_count,))
    print("Checksum mismatch: %d" % (len(differing),))
    print("Files missing    : %d" % (len(missing),))
    print("Files added      : %d" % (len(added),))


def clean_dirs(dirs):
    """Remove excluded and hidden directory names from `dirs` IN PLACE
    (so os.walk pruning works) and return the same list."""
    for s in dirs[:]:
        if (s in BADDIRS) or (s.startswith(".")):
            dirs.remove(s)
    return dirs


def clean_syms(files, path):
    """Return the subset of `files` that are not symbolic links."""
    nonsyms = []
    for f in files:
        if not os.path.islink(os.path.join(path, f)):
            nonsyms.append(f)
    return nonsyms


def createdb(options):
    """Create a fresh database with the `list` and `config` tables and
    persist the creation-time Relative/FullFile settings."""
    conn = sqlite3.connect(options.sqlfile)
    db = conn.cursor()
    conn.text_factory = str
    db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,\
                file TEXT,date INTEGER, hash TEXT,\
                size INTEGER, mime TEXT)')
    db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\
                object TEXT)')
    conn.commit()
    config = ConfigParser.RawConfigParser()
    config.add_section("General")
    config.set("General", "Relative", str(options.relative))
    config.set("General", "FullFile", str(options.fullfile))
    store = StringIO.StringIO()
    config.write(store)
    db.execute("INSERT INTO config (object) values (?)", (store.getvalue(),))
    conn.commit()
    return


def delete_nonexisting(sqlfile, options):
    """Delete DB rows whose file no longer exists (or is a symlink when
    symlinks are not being followed)."""
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbdel = conn.cursor()
    db.execute('SELECT file FROM list')
    for row in db:
        if os.path.exists(row[0]):
            delete = False
            if not options.symlinks:
                if os.path.islink(row[0]):
                    delete = True
        else:
            delete = True
        if delete:
            print('removing.. ' + row[0])
            dbdel.execute("DELETE FROM list where file == ?", (row[0],))
    conn.commit()
    return


def disk_used(options):
    """Print per-directory size totals under options.diskused, summarized
    to options.diskused_depth path components."""
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    checkpath = filename_join(options.diskused, "", options) + "/"
    if checkpath == "./":
        checkpath = ""
    db.execute('SELECT size,replace(file,?,"") as path FROM list WHERE file LIKE ?',
               (checkpath, checkpath + "%",))
    entries = []
    sizes = []
    for row in db:
        start_path = row[1].split('/')
        start_path = "/".join(start_path[0:int(options.diskused_depth)])
        if start_path not in entries:
            entries.append(start_path)
            sizes.append(row[0])
        else:
            sizes[entries.index(start_path)] += row[0]
    for entry in zip(sizes, entries):
        print("| ".join([str(entry[0]).ljust(14),
                         humanize_size(entry[0]).rjust(8),
                         entry[1]]))


def filename_join(path, name, options):
    """Join and canonicalize a path; relative to the DB dir if configured."""
    filename = os.path.realpath(os.path.join(path, name))
    if options.relative:
        return os.path.relpath(filename, options.sqlpath)
    return filename


def find_duplicates(sqlfile):
    """Return [(hash, [(file, size, date), ...]), ...] for every hash that
    occurs more than once (zero-byte files excluded)."""
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbh = conn.cursor()
    db.execute("SELECT hash,count(*) FROM list WHERE size > 0 "
               "GROUP BY hash HAVING count(*) > 1 ")
    duphash = []
    for row in db:
        hash = row[0]
        dbh.execute("SELECT file,size,date FROM list WHERE hash = ?", (hash,))
        flist = []
        for row in dbh:
            flist.append(row)
        flist.sort(key=lambda file: file[0])
        duphash.append((hash, flist))
    # Sort groups by their first (alphabetically smallest) file entry.
    duphash.sort(key=lambda file: file[1][0])
    return duphash


def ftime_match(db, filename, ftime):
    """True when the stored mtime for `filename` equals `ftime`."""
    db.execute("SELECT date FROM list where file == ?", (filename,))
    count = db.fetchall()
    if not count:
        # Defensive: file not listed at all -> treat as "does not match".
        return False
    return count[0][0] == ftime


def get_folder_contents(db, path):
    ''' return the contents of the folder '''
    files = []
    if path == "./":
        db.execute("SELECT file FROM list where file NOT LIKE ?", ('%/%',))
        path = ""
    else:
        db.execute("SELECT file FROM list where file LIKE ?", (path + '%',))
    for row in db:
        try:
            base = row[0].decode('utf-8').replace(path, '', 1)
        except UnicodeDecodeError:
            print(row[0] + " is giving me trouble.")
            try:
                base = row[0].encode('utf-8').replace(path, '', 1)
            except UnicodeDecodeError:
                print(row[0] + " is still giving me trouble.")
                sys.exit(1)
        # Keep only direct children (no '/' left after stripping the prefix).
        if base.find('/') == -1:
            files.append(base)
    return files


def get_md5(filename, fullfile=False):
    ''' returns content based hash, only first 50Mb is read,
    unless user wants the whole file '''
    fsize = os.path.getsize(filename)
    if fullfile and fsize > DEFAULT_CHUNK:
        anim_i = 0
        anim_len = len(ANIM)
        block_size = 2 ** 24
        percents_per_block = 100 / (float(fsize) / block_size)
        md5 = hashlib.md5()
        with open(filename, 'rb') as f:
            for chunk in iter(lambda: f.read(block_size), b''):
                sys.stderr.write('\r %s (%02d%%)' %
                                 (ANIM[anim_i % anim_len],
                                  int(anim_i * percents_per_block)))
                sys.stderr.flush()
                anim_i += 1
                md5.update(chunk)
        sys.stderr.write('\r      ')
        return md5.hexdigest()
    # BUGFIX: use a context manager so the handle is not leaked.
    with open(filename, 'rb') as f:
        return hashlib.md5(f.read(DEFAULT_CHUNK)).hexdigest()


def has_changes(options):
    """Implements --haschanges / --hasdeletions / --hasadditions: print
    'True' and exit(1) on the first detected difference."""
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if options.haschanges:
        options.changed = True
    if options.hasdeletions or options.haschanges:
        has_changes_deleted(db)
    if options.hasadditions or options.haschanges:
        has_changes_additions(db, options)


def has_changes_deleted(db, exit=True):
    """Find listed files that no longer exist. With exit=True, print
    'True' and exit(1) at the first hit; otherwise return the list."""
    db.execute('SELECT file FROM list')
    deleted = []
    for row in db:
        if not os.path.exists(row[0]):
            if exit:
                print('True')
                sys.exit(1)
            else:
                deleted.append(row[0])
    return deleted


def has_changes_additions(db, options, exit=True):
    """Find on-disk files that are new (or changed, when options.changed).
    With exit=True, print 'True' and exit(1) at the first hit; otherwise
    return (added, changed) path lists."""
    added = []
    changed = []
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        dirs = clean_dirs(dirs)
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for file in files:
            filename = filename_join(path, file, options)
            if file == options.sqlfile:
                continue
            # if not is_listed(db, filename):
            if file not in db_files:
                if exit:
                    print('True')
                    sys.exit(1)
                else:
                    added.append(filename)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        if exit:
                            print('True')
                            sys.exit(1)
                        else:
                            changed.append(filename)
    return (added, changed)


# ~ def hash_match(db, filename, hash):
# ~     db.execute("SELECT hash FROM list where file == ?", (filename,))
# ~     count = db.fetchall()
# ~     return count[0][0] == hash


def humanize_date(date):
    """Format a unix timestamp as 'YYYY-mm-dd HH:MM:SS' ('' for None)."""
    if date is None:
        return ''
    return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S')


def humanize_size(size, precision=1):
    """Format a byte count with a binary-magnitude suffix ('nan' for None)."""
    if size is None:
        return 'nan'
    suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
    suffixIndex = 0
    defPrecision = 0
    # BUGFIX: bound the index so sizes >= 1024**5 no longer raise IndexError.
    while size > 1024 and suffixIndex < len(suffixes) - 1:
        suffixIndex += 1      # increment the index of the suffix
        size = size / 1024.0  # apply the division
        defPrecision = precision
    return "%.*f%s" % (defPrecision, size, suffixes[suffixIndex])


def is_listed(db, filename):
    """True when `filename` has a row in the list table."""
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    count = db.fetchall()
    return count[0][0] > 0


def matchdb(sqlfile, needle, helper):
    """Print the listed file whose basename is the closest fuzzy match to
    `needle`; `helper` patterns (-s) pre-filter the candidates."""
    needle = needle.lower()
    import difflib as dl
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if len(helper) > 0:
        helper = ['%' + i + '%' for i in helper]
        like_query = ' OR '.join(['file LIKE ?' for i in helper])
        db.execute("SELECT file FROM list WHERE " + like_query +
                   " ORDER BY date DESC", helper)
    else:
        db.execute("SELECT file FROM list ORDER BY date DESC")
    ratio = 0
    best_match = ""
    for row in db:
        s = dl.SequenceMatcher(None, os.path.basename(row[0]).lower(), needle)
        s_ratio = s.ratio()
        if ratio < s_ratio:
            ratio = s_ratio
            best_match = row[0]
    print(best_match)


def print_structure(files):
    """Print duplicate groups as numbered 'size:filename' lines."""
    for hash in files:
        # print(hash[0])
        i = 1
        for f in hash[1]:
            # Parenthesized for consistency with the rest of the file
            # (single-argument print works identically in Python 2).
            print("%(i)d: %(x)d:%(f)s " % {'i': i, 'f': f[0], 'x': f[1]})
            i += 1
    return


def print_stderr(s):
    """Write s plus a newline to stderr and flush."""
    sys.stderr.write(s)
    sys.stderr.write("\n")
    sys.stderr.flush()


def searchdb(sqlfile, needle):
    """Print listed files matching any of the -s substring patterns."""
    needle = ['%' + i + '%' for i in needle]
    like_query = ' OR '.join(['file LIKE ?' for i in needle])
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY file", needle)
    for row in db:
        print(row[0])


def stored_options(options):
    """Overlay options.relative / options.fullfile with the values stored
    in the DB config table at creation time. Best-effort: any failure
    (old DB without config table, parse error) leaves options untouched."""
    try:
        conn = sqlite3.connect(options.sqlfile)
        db = conn.cursor()
        conn.text_factory = str
        db.execute("SELECT object FROM config")
        store = ""
        for row in db:
            store += row[0] + '\n'
        config = ConfigParser.RawConfigParser()
        config.readfp(io.BytesIO(store))
        options.relative = config.getboolean("General", "Relative")
        options.fullfile = config.getboolean("General", "FullFile")
    except Exception:
        # BUGFIX: was a bare except, which also swallowed SystemExit and
        # KeyboardInterrupt; keep the deliberate best-effort behaviour.
        pass
    return options


def main():
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options)
    options = stored_options(options)
    if options.relative:
        os.chdir(options.sqlpath)
    if options.haschanges or options.hasadditions or options.hasdeletions:
        has_changes(options)
        sys.exit(0)
    if options.check:
        checkdb(options)
        sys.exit(0)
    if len(options.search) > 0 and not options.match:
        searchdb(options.sqlfile, options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile, options.match, options.search)
        sys.exit(0)
    if options.diskused:
        disk_used(options)
        sys.exit(0)
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print('Adding ' + options.startpath + ' entries...')
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile)
        print_structure(files)
    sys.exit(0)


if __name__ == "__main__":
    main()