#!/usr/bin/python
"""Maintain an SQLite catalogue of image files.

The script walks a directory tree, records every image it finds in an SQLite
table called ``list`` and can afterwards search that table for small files,
duplicates (by MD5 of the first 5 MiB), similar images (by a bit fingerprint)
and nearest border colour.  Image measurements are taken by running the
ImageMagick command-line tools ``identify`` and ``convert``.

NOTE(review): this file was recovered from a copy in which all line breaks
had been collapsed and two spans of text were missing (inside ``add_single``
and between ``find_fingerprint_similar`` and ``find_duplicates``).  Every
function that had to be completed or rebuilt from its call sites carries its
own NOTE(review) comment and must be checked against version control.
"""
import hashlib
import os
import re
import sqlite3
import subprocess
import sys
from argparse import ArgumentParser
from contextlib import closing

SQLFILE = 'list_of_images.sqlite'
IMGMATCH = re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$|.*\.gif$', re.I)
BADDIRS = ['_tn', '_med']
MINSIZE = 0

try:  # Python 2 spells the line reader differently.
    _read_line = raw_input
except NameError:
    _read_line = input


def setup_options():
    """Parse the command line and return the argparse namespace.

    Side effects: folder names given with ``-x`` are appended to the
    module-level ``BADDIRS`` list.  When any of the search/measure options is
    set, the value of ``options.add`` is inverted.
    """
    parser = ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a", action="store_false", dest="add", default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c", action="store_true", dest="changed", default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("-d", action="store_true", dest="delete", default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("-f", action="store", dest="sqlfile", default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-l", action="store_true", dest="symlinks", default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("-m", type=int, dest="minsize", default=MINSIZE,
                        help="Minimum pixel width/height of stored image [%(default)s]")
    parser.add_argument("-r", action="store_true", dest="random", default=False,
                        help="Create randomized files for landscape and portrait images [%(default)s]")
    parser.add_argument("-s", type=str, dest="search", default=False,
                        help="Search list based on path pattern")
    parser.add_argument("--measure", action="store_true", dest="measure", default=False,
                        help="Measure various statistics for similarity/color searches. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--nearest", type=str, dest="nearestcolor", default=False,
                        help="Search list for nearest mean color. format: R,G,B in float 0-1. Add fourth value to limit search to number")
    parser.add_argument("--dup", action="store_true", dest="duplicate", default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--del", action="store_true", dest="deleteFiles", default=False,
                        help="Delete files listed with --small. [%(default)s]")
    parser.add_argument("--small", action="store_true", dest="searchsmall", default=False,
                        help="Return a list of small files, smaller than -m INT. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--similar", type=str, dest="similarity", default=None,
                        help="Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity")
    parser.add_argument("-x", action="append", dest="exclude", default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument('startpath', action="store", default='.', nargs='?')
    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if (options.duplicate or options.searchsmall or options.measure
            or options.nearestcolor or options.similarity is not None
            or options.search):
        options.add = not options.add
    return options


def _connect(sqlfile):
    """Open *sqlfile* with TEXT columns returned as ``str``."""
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    return conn


def _like_escape(text):
    """Escape LIKE wildcards in *text* for use with ``ESCAPE '\\'``."""
    return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


def _run_tool(args, data=None):
    """Run an external command and return its stdout as text.

    *data*, when given, is written to the command's stdin.  Text mode is used
    so the result is ``str`` on both Python 2 and Python 3.
    """
    stdin = subprocess.PIPE if data is not None else None
    proc = subprocess.Popen(args, stdin=stdin, stdout=subprocess.PIPE,
                            universal_newlines=True)
    out, _ = proc.communicate(input=data)
    return out


def _hamming(a, b):
    """Return the number of differing bits between two integers."""
    return bin(a ^ b).count('1')


def createdb(sqlfile):
    """Create the ``list`` table in a new database file *sqlfile*."""
    with closing(_connect(sqlfile)) as conn:
        conn.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,'
                     ' file TEXT,date INTEGER,portrait NUMERIC, hash TEXT,'
                     ' width INTEGER,height INTEGER,fingerprint TEXT,'
                     ' R REAL,G REAL, B REAL, BR REAL, BG REAL, BB REAL)')
        conn.commit()


def delete_nonexisting(sqlfile):
    """Remove every row whose ``file`` path no longer exists on disk."""
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        db.execute('SELECT file FROM list')
        # Materialise the result first: deleting while the SELECT cursor is
        # still being iterated is not reliable.
        for (name,) in db.fetchall():
            if not os.path.exists(name):
                print('removing.. ' + name)
                db.execute("DELETE FROM list where file == ?", (name,))
        conn.commit()


def confirm(prompt="Are you sure?"):
    """Ask a yes/no question on the terminal; return True only for yes.

    NOTE(review): the original definition was lost; rebuilt from the single
    call ``confirm(prompt=...)`` in ``delete_files``.  Anything other than an
    explicit "y"/"yes" (including end-of-input) counts as "no".
    """
    try:
        answer = _read_line(prompt + ' [y/N] ')
    except EOFError:
        return False
    return answer.strip().lower() in ('y', 'yes')


def delete_files(files):
    """Delete the listed files from disk after interactive confirmation.

    *files* has the structure produced by ``find_smalls``: a list of
    ``(label, rows)`` pairs where each row starts with the file path.
    """
    print_structure(files)
    if not confirm(prompt="Sure to delete these files?"):
        return
    print("now delling")
    for group in files:
        for row in group[1]:
            print(row[0])
            try:
                os.remove(row[0])
            except OSError as err:
                # Report and carry on so one failure does not strand the rest.
                print('error deleting file: %s (%s)' % (row[0], err))


def clean_dirs(dirs):
    """Remove folder names listed in ``BADDIRS`` from *dirs*, in place.

    The list is modified in place (and returned) so that ``os.walk`` does not
    descend into the removed folders.

    NOTE(review): the original definition was lost; rebuilt from its call in
    ``add_recurse`` and from the otherwise unused ``BADDIRS`` constant.
    """
    dirs[:] = [d for d in dirs if d not in BADDIRS]
    return dirs


def clean_syms(files, path='.'):
    """Return *files* without the entries that are symbolic links.

    NOTE(review): the original definition was lost; rebuilt from its call in
    ``add_recurse``.  *path* is the folder the names belong to.
    """
    return [f for f in files if not os.path.islink(os.path.join(path, f))]


def _store(conn, filename, change, minsize):
    """Add or update one file; on failure report it, commit and exit(1)."""
    try:
        add_single(conn, filename, change=change, minsize=minsize)
    except Exception as err:
        verb = 'changing' if change else 'adding'
        print('error ' + verb + ' file: ' + filename)
        print('  ' + repr(err))
        conn.commit()  # keep the entries stored so far
        sys.exit(1)


def add_recurse(options):
    """Walk ``options.startpath`` and add/update image entries.

    New files are added when ``options.add`` is set; files already listed are
    re-measured when ``options.changed`` is set and their modification time
    differs from the stored one.  Exits with status 1 on the first file that
    cannot be processed.
    """
    with closing(_connect(options.sqlfile)) as conn:
        db = conn.cursor()
        for path, dirs, files in os.walk(options.startpath,
                                         followlinks=options.symlinks):
            print('Checking ' + path)
            clean_dirs(dirs)
            dirs.sort()
            if not options.symlinks:
                files = clean_syms(files, path)
            files.sort()
            db_files = set(get_folder_contents(db, os.path.abspath(path) + '/'))
            for name in files:
                if not IMGMATCH.match(name):
                    continue
                filename = os.path.abspath(os.path.join(path, name))
                if name not in db_files:
                    if options.add:
                        _store(conn, filename, False, options.minsize)
                elif options.changed:
                    # A differing mtime is taken as "content changed".
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        _store(conn, filename, True, options.minsize)
            conn.commit()


def add_single(conn, filename, change=False, hash=None, minsize=0):
    """Insert (or, with *change*, update) the row for *filename*.

    Files narrower or lower than *minsize* pixels are skipped.  The caller is
    responsible for committing.

    NOTE(review): only the first two statements of the original body
    survived.  The rest is rebuilt from the table schema and from how the
    other functions read the columns: ``date`` is compared against
    ``os.path.getmtime``, ``hash`` against ``get_md5``, and measurement
    columns are re-computed when they are NULL.  ``portrait`` is assumed to
    mean height > width - confirm.
    """
    dims = get_dims(filename)
    width, height = int(dims[0]), int(dims[1])
    if width < minsize or height < minsize:
        print('%s too small (%dx%d)' % (filename, width, height))
        return
    if hash is None:
        hash = get_md5(filename)
    ftime = os.path.getmtime(filename)
    portrait = height > width
    db = conn.cursor()
    if change:
        # Clear the measurements so --measure recomputes them.
        db.execute("UPDATE list SET date=?, portrait=?, hash=?, width=?, height=?,"
                   " fingerprint=NULL, R=NULL, G=NULL, B=NULL,"
                   " BR=NULL, BG=NULL, BB=NULL WHERE file=?",
                   (ftime, portrait, hash, width, height, filename))
        print('changing: %s (%dx%d)' % (filename, width, height))
    else:
        db.execute("INSERT INTO list(file,date,portrait,hash,width,height)"
                   " VALUES(?,?,?,?,?,?)",
                   (filename, ftime, portrait, hash, width, height))
        print('adding: %s (%dx%d)' % (filename, width, height))


def random_lists(sqlfile):
    """Create randomized lists of landscape and portrait images.

    NOTE(review): the original definition was lost and neither the output
    file names nor their format can be inferred from the rest of the script,
    so this is deliberately left unimplemented.
    """
    raise NotImplementedError(
        'random_lists: implementation missing from the recovered source')


def get_folder_contents(db, path):
    """Return the base names of listed files that sit directly in *path*.

    *path* must end with ``/``.  Files in sub-folders are not returned.
    """
    # Escape % and _ so a folder name is matched literally by LIKE.
    db.execute("SELECT file FROM list where file LIKE ? ESCAPE '\\'",
               (_like_escape(path) + '%',))
    files = []
    for (name,) in db:
        if not name.startswith(path):  # LIKE ignores ASCII case
            continue
        base = name[len(path):]
        if '/' not in base:
            files.append(base)
    return files


def ftime_match(db, filename, ftime):
    """Return True if the stored ``date`` of *filename* equals *ftime*.

    Returns False when *filename* has no row.
    """
    db.execute("SELECT date FROM list where file == ?", (filename,))
    rows = db.fetchall()
    return bool(rows) and rows[0][0] == ftime


def hash_match(db, filename, hash):
    """Return True if the stored ``hash`` of *filename* equals *hash*.

    Returns False when *filename* has no row.
    """
    db.execute("SELECT hash FROM list where file == ?", (filename,))
    rows = db.fetchall()
    return bool(rows) and rows[0][0] == hash


def get_md5(filename):
    """Return the MD5 hex digest of the first 5 megabytes of the file."""
    with open(filename, 'rb') as handle:
        return hashlib.md5(handle.read(1024 * 1024 * 5)).hexdigest()


def get_dims(filename):
    """Return ``[width, height]`` (as strings) reported by ``identify``.

    Only the first frame (``[0]``) of the image is inspected.
    """
    out = _run_tool(['identify', '-format', '%wx%h', filename + '[0]'])
    return out.strip().split('x')


def append_colors(sqlfile):
    """Fill the colour columns of every row whose ``R`` is NULL."""
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        db.execute("SELECT file,R FROM list WHERE R IS NULL ORDER BY file")
        pending = [row[0] for row in db.fetchall()]
        for i, name in enumerate(pending, 1):
            try:
                mean, border = get_colors(name)
            except (ValueError, IndexError) as err:
                # convert produced nothing usable; leave the row for a later run
                print('error measuring colors: %s (%s)' % (name, err))
                continue
            db.execute("UPDATE list SET R=?, G=?, B=?, BR=?, BG=?, BB=?"
                       " WHERE file=?",
                       (mean[0], mean[1], mean[2],
                        border[0], border[1], border[2], name))
            print("colors: %(f)s (%(r)s %(g)s %(b)s)" % {'f': name,
                                                         'r': mean[0],
                                                         'g': mean[1],
                                                         'b': mean[2]})
            if i % 50 == 0:
                conn.commit()
        conn.commit()


def find_color_nearest(sqlfile, src):
    """Print the files whose border colour is nearest to *src*.

    *src* is ``"R,G,B"`` or ``"R,G,B,N"`` with float components; ``N`` limits
    the number of hits and defaults to 1.  Distance is the sum of absolute
    differences of the BR/BG/BB columns.
    """
    src = [float(i) for i in src.strip().strip('"').split(',')]
    if len(src) == 3:
        src.append(1)
    if len(src) != 4:
        raise ValueError('expected R,G,B or R,G,B,N, got %d values' % len(src))
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        # Unmeasured rows have NULL colours; NULL sorts first in SQLite and
        # cannot be formatted with %.2f, so they are excluded.
        db.execute("SELECT file, ABS(BR-?)+ABS(BG-?)+ABS(BB-?) as K,BR,BG,BB"
                   " FROM list WHERE BR IS NOT NULL ORDER BY K LIMIT ?",
                   (src[0], src[1], src[2], int(src[3])))
        for hit in db.fetchall():
            print("%(f)s : D %(d).2f (RGB %(r).2f,%(g).2f,%(b).2f)" % {
                'f': hit[0], 'd': hit[1],
                'r': hit[2], 'g': hit[3], 'b': hit[4]})


def get_colors(filename):
    """Return ``(mean, border)`` RGB triples measured with ``convert``.

    The image is first reduced to 10x10 pixels; *mean* is the mean colour of
    that thumbnail and *border* the mean colour of its four edge strips.
    """
    small_args = ['convert', '-define', 'jpeg:size=64x64', filename + '[0]',
                  '-resize', '10x10!', 'TEXT:-']
    img = _run_tool(small_args)
    fmt = '"%[fx:mean.r],%[fx:mean.g],%[fx:mean.b]"'
    mean = _run_tool(['convert', '-', '-format', fmt, 'info:-'], data=img)
    border_args = [
        'convert', '-',
        '(', '+clone', '-gravity', 'North', '-crop', '10x1+0+0',
        '-write', 'mpr:top', '+delete', ')',
        '(', '+clone', '-gravity', 'South', '-crop', '10x1+0+0',
        '-write', 'mpr:bot', '+delete', ')',
        '(', '+clone', '-gravity', 'West', '-crop', '1x10+0+0', '-rotate', '90',
        '-write', 'mpr:lef', '+delete', ')',
        '(', '+clone', '-gravity', 'East', '-crop', '1x10+0+0', '-rotate', '90',
        '-write', 'mpr:rig', '+delete', ')',
        '+delete', 'mpr:top', 'mpr:bot', 'mpr:lef', 'mpr:rig', '+append',
        '-format', fmt, 'info:-']
    border = _run_tool(border_args, data=img)
    mean = [float(i) for i in mean.strip().strip('"').split(',')]
    border = [float(i) for i in border.strip().strip('"').split(',')]
    return (mean, border)


def append_fingerprints(sqlfile):
    """Fill ``fingerprint`` for every row where it is NULL."""
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        db.execute("SELECT file FROM list WHERE fingerprint IS NULL ORDER BY file")
        pending = [row[0] for row in db.fetchall()]
        for i, name in enumerate(pending, 1):
            try:
                fp = get_fingerprint(name)
            except ValueError as err:
                # convert produced nothing usable; leave the row for a later run
                print('error measuring fingerprint: %s (%s)' % (name, err))
                continue
            db.execute("UPDATE list SET fingerprint=? WHERE file=?", (fp, name))
            print(" %(f)s" % {'f': name})
            if i % 50 == 0:
                conn.commit()
        conn.commit()


def get_fingerprint(filename):
    """Return the image fingerprint as a decimal string.

    ``convert`` reduces the image to a 16x16, 1-bit grey thumbnail in text
    form; each output line whose third comma-separated field is ``255``
    contributes a ``1`` bit, every other line with at least three fields a
    ``0`` bit.  Raises ValueError when no bits could be read.
    """
    small_args = ['convert', '-define', 'jpeg:size=256x256', filename + '[0]',
                  '-resize', '160x160!', '-colorspace', 'Gray', '-blur', '2x2',
                  '-normalize', '-equalize', '-resize', '16x16', '-depth', '1',
                  'TEXT:-']
    img = _run_tool(small_args)
    bits = []
    for row in img.split('\n'):
        gray = row.split(',')
        if len(gray) < 3:
            continue
        bits.append('1' if gray[2] == "255" else '0')
    if not bits:
        raise ValueError('no pixel data from convert for ' + filename)
    return str(int(''.join(bits), 2))


def find_fingerprint_similar(sqlfile, thr):
    """Find all similar images, nearest match more similar than *thr*.

    Similarity is the number of differing fingerprint bits (lower is more
    similar).

    NOTE(review): the original body was cut off at ``if similarity``; the
    nearest-match bookkeeping and the output line below are a reconstruction.
    "More similar than thr" is read as a strict ``<`` - confirm.
    """
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        # Rows that were never measured have no fingerprint to compare.
        db.execute("SELECT file,fingerprint,width,height FROM list"
                   " WHERE fingerprint IS NOT NULL ORDER BY file")
        rows = [(r[0], int(r[1])) for r in db.fetchall()]
    hits = set()
    for name, fp in rows:
        if name in hits:
            continue
        nearest, distance = '', sys.maxsize
        for other, other_fp in rows:
            if other == name:
                continue
            similarity = _hamming(fp, other_fp)
            if similarity < distance:
                nearest, distance = other, similarity
        if nearest and distance < thr:
            hits.add(name)
            hits.add(nearest)
            print('"%(a)s" "%(b)s" : D %(d)d' % {'a': name, 'b': nearest,
                                                 'd': distance})


def find_fingerprint_nearest(sqlfile, filename):
    """Print the listed image whose fingerprint is nearest to *filename*.

    NOTE(review): the original definition was lost; rebuilt from its call in
    ``main`` (used when the --similar value is an existing path) with the
    same bit-difference measure as ``find_fingerprint_similar``.
    """
    target = int(get_fingerprint(filename))
    own_path = os.path.abspath(filename)
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        db.execute("SELECT file,fingerprint FROM list"
                   " WHERE fingerprint IS NOT NULL ORDER BY file")
        rows = db.fetchall()
    best = None
    for name, fp in rows:
        if name == own_path:
            continue
        distance = _hamming(target, int(fp))
        if best is None or distance < best[1]:
            best = (name, distance)
    if best is None:
        print('no fingerprints to compare against')
        return
    print('"%(a)s" "%(b)s" : D %(d)d' % {'a': filename, 'b': best[0],
                                         'd': best[1]})


def searchdb(sqlfile, pattern):
    """Return listed files whose path contains *pattern*.

    The result has the ``[(label, rows)]`` shape ``print_structure`` expects,
    each row being ``(file, width, height)``.

    NOTE(review): the original definition was lost; rebuilt from its call in
    ``main``.  *pattern* is passed to LIKE unescaped, so % and _ act as
    wildcards - confirm that matches the original.
    """
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        db.execute("SELECT file,width,height FROM list WHERE file LIKE ?"
                   " ORDER BY file", ('%' + pattern + '%',))
        return [('search', db.fetchall())]


def find_duplicates(sqlfile, search):
    """Return groups of files sharing a hash.

    The result is a list of ``(hash, rows)`` pairs; each row is
    ``(file, width, height, date)`` and rows are ordered by ``date``.

    NOTE(review): the head of this function (including the start of the
    first query) was lost.  *search* is the start path handed over by
    ``main``; it is assumed to select hashes of files below that path -
    confirm.
    """
    prefix = _like_escape(os.path.abspath(search)) + '%'
    duphash = []
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        db.execute("SELECT hash FROM list WHERE file LIKE ? ESCAPE '\\'"
                   " GROUP BY hash HAVING COUNT(hash) > 1 ", (prefix,))
        for (digest,) in db.fetchall():
            db.execute("SELECT file,width,height,date FROM list WHERE hash = ?",
                       (digest,))
            flist = sorted(db.fetchall(), key=lambda row: row[3])
            duphash.append((digest, flist))
    duphash.sort(key=lambda group: group[1][0])
    return duphash


def find_smalls(minsize, sqlfile):
    """Return ``[('smalls', rows)]`` for files narrower or lower than *minsize*.

    Each row is ``(file, width, height)``.
    """
    with closing(_connect(sqlfile)) as conn:
        db = conn.cursor()
        db.execute("SELECT file,width,height FROM list WHERE width < ?"
                   " OR height < ?", (minsize, minsize))
        return [('smalls', db.fetchall())]


def print_structure(files):
    """Print a numbered ``(WxH):path`` line for every row of every group."""
    for group in files:
        for i, row in enumerate(group[1], 1):
            print("%(i)d: (%(x)dx%(y)d):%(f)s " % {'i': i, 'f': row[0],
                                                   'x': row[1], 'y': row[2]})


def print_dup_structure(files):
    """Print one ``index:count: "a", "b"`` line per duplicate group."""
    for i, group in enumerate(files, 1):
        fnames = [' "' + row[0] + '"' for row in group[1]]
        print("%(i)d:%(n)d:%(f)s " % {'i': i, 'n': len(fnames),
                                      'f': ",".join(fnames)})


def main():
    """Run the actions selected on the command line, then exit with 0."""
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options.sqlfile)
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile)
    if options.add or options.changed:
        print('Adding entries...')
        add_recurse(options)
    if options.search:
        print_structure(searchdb(options.sqlfile, options.search))
    if options.measure:
        print('Adding colors...')
        append_colors(options.sqlfile)
        print('Adding fingerprints...')
        append_fingerprints(options.sqlfile)
    if options.random:
        print('Random lists...')
        random_lists(options.sqlfile)
    if options.nearestcolor:
        find_color_nearest(options.sqlfile, options.nearestcolor)
    if options.similarity is not None:
        # An existing path means "compare against this file"; anything else
        # is taken as the numeric threshold.
        if os.path.exists(options.similarity):
            find_fingerprint_nearest(options.sqlfile, options.similarity)
        else:
            find_fingerprint_similar(options.sqlfile, int(options.similarity))
    if options.duplicate:
        files = find_duplicates(options.sqlfile, options.startpath)
        print_dup_structure(files)
    if options.searchsmall:
        files = find_smalls(options.minsize, options.sqlfile)
        if options.deleteFiles:
            if len(files[0][1]) > 0:
                delete_files(files)
                delete_nonexisting(options.sqlfile)
        else:
            print_structure(files)
    sys.exit(0)


if __name__ == "__main__":
    main()