#!/usr/bin/python
"""Maintain an SQLite index of the files found under a directory tree.

Each row of the ``list`` table stores a file's absolute path, modification
time, md5 hash (of the first 50 MiB), size and libmagic description.  The
command line can add, refresh and delete entries, report whether the index
is out of date, search it, and list files that share a hash.

The code sticks to syntax that is valid on both Python 2 and Python 3.
"""
import sys
import os
import re
import sqlite3
import subprocess
import hashlib
import difflib
from argparse import ArgumentParser
from contextlib import closing

try:
    import magic
except ImportError:
    # Only add_single() needs libmagic; searching, matching and duplicate
    # listing keep working without the bindings installed.
    magic = None

SQLFILE = 'list_of_files.sqlite'
IMGMATCH = re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$', re.I)
BADDIRS = []
MINSIZE = 0
if magic is not None:
    MIME = magic.open(magic.MAGIC_NONE)
    # MIME=magic.open(magic.MAGIC_MIME)
    MIME.load()
else:
    MIME = None


def setup_options():
    """Parse the command line and return the argparse namespace.

    Side effects: every ``-x`` value is appended to the module-level
    BADDIRS list, and ``--dup`` flips the ``add`` option.
    """
    parser = ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a", action="store_false", dest="add", default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c", action="store_true", dest="changed", default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("-d", action="store_true", dest="delete", default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("--haschanges", action="store_true", dest="haschanges", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasdeletions", action="store_true", dest="hasdeletions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasadditions", action="store_true", dest="hasadditions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("-l", action="store_true", dest="symlinks", default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("--dup", action="store_true", dest="duplicate", default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("-x", action="append", dest="exclude", default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument("-f", action="store", dest="sqlfile", default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-s", type=str, action='append', dest="search", default=[],
                        help="Search list based on path pattern")
    parser.add_argument("--match", type=str, dest="match", default=False,
                        help="Search for closest match from basenames, can be helped with adding -s")
    parser.add_argument('startpath', action="store", default='.', nargs='?')
    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        options.add = not options.add
    return options


def createdb(fname):
    """Create the ``list`` table in the SQLite file *fname*."""
    with closing(sqlite3.connect(fname)) as conn:
        conn.text_factory = str
        conn.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT, '
                     'file TEXT,date INTEGER, hash TEXT, '
                     'size INTEGER, mime TEXT)')
        conn.commit()


def delete_nonexisting(sqlfile, options):
    """Delete entries whose file is gone.

    When ``options.symlinks`` is false, entries that are symbolic links are
    deleted as well.  Every removed path is printed.
    """
    with closing(sqlite3.connect(sqlfile)) as conn:
        conn.text_factory = str
        db = conn.cursor()
        # Read all rows first so the table is not modified while a SELECT
        # on it is still being stepped through.
        rows = db.execute('SELECT file FROM list').fetchall()
        for row in rows:
            path = row[0]
            stale = (not os.path.exists(path)
                     or (not options.symlinks and os.path.islink(path)))
            if stale:
                print('removing.. ' + path)
                db.execute("DELETE FROM list where file == ?", (path,))
        conn.commit()


def has_changes(options):
    """Run the read-only checks selected by the ``--has*`` options.

    The individual checks print ``True`` and exit with status 1 as soon as
    they find a difference; returning normally means none was found.
    """
    with closing(sqlite3.connect(options.sqlfile)) as conn:
        conn.text_factory = str
        db = conn.cursor()
        if options.haschanges:
            # --haschanges also compares the modification times of known files.
            options.changed = True
        if options.hasdeletions or options.haschanges:
            has_changes_deleted(db)
        if options.hasadditions or options.haschanges:
            has_changes_additions(db, options)


def has_changes_deleted(db):
    """Print ``True`` and exit(1) if any listed file no longer exists."""
    db.execute('SELECT file FROM list')
    for row in db:
        if not os.path.exists(row[0]):
            print('True')
            sys.exit(1)


def _walk_candidates(db, options):
    """Yield ``(absolute_path, already_listed)`` for files under startpath.

    Directories are pruned with clean_dirs(), symbolic links are skipped
    unless ``options.symlinks`` is set, and the database file is skipped.
    """
    sql_abspath = os.path.abspath(options.sqlfile)
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        # clean_dirs() and sort() work in place, which is what makes
        # os.walk() skip the pruned directories and descend in order.
        clean_dirs(dirs)
        dirs.sort()
        files.sort()
        # rstrip keeps the prefix a single '/' when walking the root directory.
        prefix = os.path.abspath(path).rstrip('/') + '/'
        listed = set(get_folder_contents(db, prefix))
        if not options.symlinks:
            files = clean_syms(files, path)
        for name in files:
            filename = os.path.abspath(os.path.join(path, name))
            # The basename test is the historical behaviour; the absolute
            # path test also covers "-f some/dir/db.sqlite".
            if name == options.sqlfile or filename == sql_abspath:
                continue
            yield filename, name in listed


def has_changes_additions(db, options):
    """Print ``True`` and exit(1) on the first new or changed file.

    Modification times are only compared when ``options.changed`` is set.
    """
    for filename, listed in _walk_candidates(db, options):
        if not listed:
            print('True')
            sys.exit(1)
        if options.changed and not ftime_match(db, filename, os.path.getmtime(filename)):
            # file content changed
            print('True')
            sys.exit(1)


def add_recurse(options):
    """Walk ``options.startpath`` and bring the database up to date.

    Unlisted files are inserted when ``options.add`` is set; listed files
    whose modification time differs are refreshed when ``options.changed``
    is set.  Everything is committed once, at the end of the walk.
    """
    with closing(sqlite3.connect(options.sqlfile)) as conn:
        conn.text_factory = str
        db = conn.cursor()
        for filename, listed in _walk_candidates(db, options):
            if not listed:
                if options.add:
                    add_single(conn, filename, change=False)
            elif options.changed:
                if not ftime_match(db, filename, os.path.getmtime(filename)):
                    # file content changed
                    add_single(conn, filename, change=True)
        conn.commit()


def add_single(conn, filename, change=False, hash=None, minsize=0):
    """Insert *filename* into the list, or update its row when *change* is true.

    The path is printed first.  *hash* is computed with get_md5() when it is
    not supplied.  *minsize* is accepted but not used.  The caller commits.

    Raises RuntimeError when the libmagic bindings could not be imported.
    """
    if MIME is None:
        raise RuntimeError("the 'magic' module (libmagic bindings) is required to add files")
    db = conn.cursor()
    print(filename)
    if hash is None:
        hash = get_md5(filename)
    ftime = os.path.getmtime(filename)
    fsize = os.path.getsize(filename)
    # Byte strings (the normal case on Python 2) are decoded as before;
    # text strings are passed through unchanged.
    mime_path = filename.decode('utf-8') if isinstance(filename, bytes) else filename
    mime = MIME.file(mime_path)
    if change:
        db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? "
                   "WHERE file=?", (ftime, hash, fsize, mime, filename))
    else:
        db.execute("INSERT INTO list(file,date,hash,size,mime) "
                   "VALUES(?,?,?,?,?)", (filename, ftime, hash, fsize, mime))


def is_listed(db, filename):
    """Return True if *filename* has at least one row in the list."""
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    return db.fetchone()[0] > 0


def get_folder_contents(db, path):
    """Return the names of the listed files located directly in *path*.

    *path* is the folder prefix including its trailing ``/``.  Entries in
    sub-folders are left out.
    """
    # Escape LIKE wildcards so '%' and '_' in directory names match literally.
    escaped = path.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    db.execute("SELECT file FROM list where file LIKE ? ESCAPE '\\'", (escaped + '%',))
    files = []
    for row in db:
        # LIKE ignores ASCII case, so confirm the exact prefix before
        # stripping it.
        if not row[0].startswith(path):
            continue
        base = row[0][len(path):]
        if '/' not in base:
            files.append(base)
    return files


def ftime_match(db, filename, ftime):
    """Return True if the stored date of *filename* equals *ftime*.

    A file without a row never matches.
    """
    row = db.execute("SELECT date FROM list where file == ?", (filename,)).fetchone()
    return row is not None and row[0] == ftime


def hash_match(db, filename, hash):
    """Return True if the stored hash of *filename* equals *hash*.

    A file without a row never matches.
    """
    row = db.execute("SELECT hash FROM list where file == ?", (filename,)).fetchone()
    return row is not None and row[0] == hash


def get_md5(filename):
    """Return the md5 hex digest of the first 50 MiB of *filename*."""
    with open(filename, 'rb') as handle:
        return hashlib.md5(handle.read(1024 * 1024 * 50)).hexdigest()


def clean_dirs(dirs):
    """Remove hidden and excluded names from *dirs* in place and return it.

    A name is dropped when it starts with ``.`` or is listed in BADDIRS.
    """
    for s in dirs[:]:
        if (s in BADDIRS) or s.startswith("."):
            dirs.remove(s)
    return dirs


def clean_syms(files, path):
    """Return the names in *files* that are not symbolic links inside *path*."""
    return [f for f in files if not os.path.islink(os.path.join(path, f))]


def find_duplicates(sqlfile):
    """Return ``[(hash, [(file, size, date), ...]), ...]`` for shared hashes.

    Each file list is sorted by path, and the groups are sorted by their
    first file entry.
    """
    duphash = []
    with closing(sqlite3.connect(sqlfile)) as conn:
        conn.text_factory = str
        db = conn.cursor()
        dbh = conn.cursor()
        db.execute("SELECT hash,count(*) FROM list group by hash HAVING count(*) > 1 ")
        for row in db:
            digest = row[0]
            dbh.execute("SELECT file,size,date FROM list WHERE hash = ?", (digest,))
            flist = dbh.fetchall()
            flist.sort(key=lambda entry: entry[0])
            # A NULL hash groups in SQL but never matches "hash = ?",
            # which would leave an empty file list here.
            if flist:
                duphash.append((digest, flist))
    duphash.sort(key=lambda entry: entry[1][0])
    return duphash


def searchdb(sqlfile, needle):
    """Print every listed path containing any of the strings in *needle*.

    The strings are used inside SQL LIKE patterns.  An empty *needle* list
    prints nothing.
    """
    patterns = ['%' + i + '%' for i in needle]
    if not patterns:
        return
    like_query = ' OR '.join(['file LIKE ?' for i in patterns])
    with closing(sqlite3.connect(sqlfile)) as conn:
        conn.text_factory = str
        db = conn.cursor()
        db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY file", patterns)
        for row in db:
            print(row[0])


def matchdb(sqlfile, needle, helper):
    """Print the listed path whose basename is most similar to *needle*.

    *helper* is a list of strings; when it is not empty, only paths
    containing one of them are considered.  Rows are read newest first and
    a later row only wins with a strictly higher similarity ratio.  An
    empty line is printed when no row is found.
    """
    with closing(sqlite3.connect(sqlfile)) as conn:
        conn.text_factory = str
        db = conn.cursor()
        if len(helper) > 0:
            patterns = ['%' + i + '%' for i in helper]
            like_query = ' OR '.join(['file LIKE ?' for i in patterns])
            db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY date DESC", patterns)
        else:
            db.execute("SELECT file FROM list ORDER BY date DESC")
        ratio = 0
        best_match = ""
        for row in db:
            s_ratio = difflib.SequenceMatcher(None, os.path.basename(row[0]), needle).ratio()
            if ratio < s_ratio:
                ratio = s_ratio
                best_match = row[0]
    print(best_match)


def print_structure(files):
    """Print the groups returned by find_duplicates().

    Each file is printed as ``index: size:path``, with the index starting
    again at 1 for every group.
    """
    for group in files:
        for i, f in enumerate(group[1], 1):
            print("%(i)d: %(x)d:%(f)s " % {'i': i, 'f': f[0], 'x': f[1]})


def main():
    """Command-line entry point: run the selected mode, then exit."""
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options.sqlfile)
    if options.haschanges or options.hasadditions or options.hasdeletions:
        # has_changes() exits with status 1 itself when it finds a difference.
        has_changes(options)
        sys.exit(0)
    if len(options.search) > 0 and not options.match:
        searchdb(options.sqlfile, options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile, options.match, options.search)
        sys.exit(0)
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print('Adding ' + options.startpath + ' entries...')
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile)
        print_structure(files)
    sys.exit(0)


if __name__ == "__main__":
    main()