#!/usr/bin/python

import sys
import os
import re
import sqlite3
import subprocess
import hashlib
import magic
from argparse import ArgumentParser

# Defaults; BADDIRS is extended at runtime from the -x options.
SQLFILE='list_of_files.sqlite'
IMGMATCH=re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$',re.I)
BADDIRS=[]
MINSIZE=0

# libmagic handle (uses the binding that provides magic.open()/load()/file())
MIME=magic.open(magic.MAGIC_NONE)
#MIME=magic.open(magic.MAGIC_MIME)
MIME.load()

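# Example invocations (illustrative; the script file name below is assumed, and the
# database defaults to list_of_files.sqlite in the current directory, see -f):
#
#   ./list_images.py /path/to/photos                  # index a tree, adding new files
#   ./list_images.py -x .git -x tmp /path/to/photos   # skip folders named .git or tmp
#   ./list_images.py -d -c /path/to/photos            # drop missing entries, re-hash changed ones
#   ./list_images.py --dup                            # list duplicate entries by hash
#   ./list_images.py -s 2019 -s holiday               # print stored paths matching a substring
#   ./list_images.py --match IMG_1234.jpg             # closest basename match via difflib
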
def setup_options():
    parser=ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a",action="store_false",dest="add",default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c",action="store_true",dest="changed",default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("-d",action="store_true",dest="delete",default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("-l",action="store_true",dest="symlinks",default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("-x",action="append",dest="exclude",default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument("-f",action="store",dest="sqlfile",default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-s",type=str,action='append',dest="search",default=[],
                        help="Search list based on path pattern")
    parser.add_argument("--match",type=str,dest="match",default=False,
                        help="Search for closest match from basenames, can be helped with adding -s")
    parser.add_argument('startpath', action="store",default='.', nargs='?')

    options=parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        options.add=not options.add
    return options

def createdb(fname):
    ''' create the sqlite database with the single "list" table '''
    conn=sqlite3.connect(fname)
    db=conn.cursor()
    conn.text_factory=str
    db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,\
                file TEXT,date INTEGER, hash TEXT,\
                size INTEGER, mime TEXT)')
    conn.commit()
    return

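# The database can also be inspected directly with the sqlite3 command-line shell
# (illustrative queries; column names come from the CREATE TABLE statement above):
#
#   sqlite3 list_of_files.sqlite "SELECT file,size FROM list ORDER BY size DESC LIMIT 10"
#   sqlite3 list_of_files.sqlite "SELECT mime,count(*) FROM list GROUP BY mime"
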
def delete_nonexisting(sqlfile,options):
    ''' drop rows whose file no longer exists (or is a symlink when -l is not set) '''
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    dbdel=conn.cursor()
    db.execute('SELECT file FROM list')
    for row in db:
        if os.path.exists(row[0]):
            delete=False
            if not options.symlinks:
                if os.path.islink(row[0]):
                    delete=True
        else:
            delete=True
        if delete:
            print('removing.. '+row[0])
            dbdel.execute("DELETE FROM list where file == ?",(row[0],))
    conn.commit()
    return

def add_recurse(options):
    ''' walk startpath and add new (and optionally changed) files to the database '''
    conn=sqlite3.connect(options.sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    for path,dirs,files in os.walk(options.startpath,followlinks=options.symlinks):
        dirs=clean_dirs(dirs)   # prunes excluded folders in place so os.walk skips them
        dirs.sort()
        files.sort()
        db_files=get_folder_contents(db,os.path.abspath(path)+'/')
        if not options.symlinks:
            files=clean_syms(files,path)
        for file in files:
            filename=os.path.abspath(os.path.join(path,file))
            if file==options.sqlfile:
                # never index the database file itself
                continue
            #if not is_listed(db,filename):
            if file not in db_files:
                if options.add:
                    add_single(conn,filename,change=False)
            else:
                if options.changed:
                    ftime=os.path.getmtime(filename)
                    if not ftime_match(db,filename,ftime):
                        #file content changed
                        add_single(conn,filename,change=True)
        conn.commit()

    return

def add_single(conn,filename,change=False,hash=None,minsize=0):
    ''' insert or update a single file row; hashes the file unless a hash is given '''
    db=conn.cursor()
    print("%(f)s" % {'f':filename})
    if hash is None:
        hash=get_md5(filename)
    ftime=os.path.getmtime(filename)
    fsize=os.path.getsize(filename)
    mime=MIME.file(filename)
    if change:
        db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
                    WHERE file=?",(ftime,hash,fsize,mime,filename))
        #print "changing: %(f)s " % {'f':filename}
    else:
        db.execute("INSERT INTO list(file,date,hash,size,mime)\
                    VALUES(?,?,?,?,?)",(filename,ftime,hash,fsize,mime))
    return

def is_listed(db,filename):
    db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,))
    count=db.fetchall()
    return count[0][0]>0

def get_folder_contents(db,path):
    ''' return the basenames of files recorded directly under path (path must end with /) '''
    files=[]
    db.execute("SELECT file FROM list where file LIKE ?",(path+'%',))
    for row in db:
        base=row[0].replace(path,'',1)
        if base.find('/')==-1:
            files.append(base)
    return files

def ftime_match(db,filename,ftime):
    ''' True if the stored mtime for filename equals ftime '''
    db.execute("SELECT date FROM list where file == ?",(filename,))
    count=db.fetchall()
    return count[0][0]==ftime

def hash_match(db,filename,hash):
    db.execute("SELECT hash FROM list where file == ?",(filename,))
    count=db.fetchall()
    return count[0][0]==hash

def get_md5(filename):
    ''' returns content based hash, only first 50Mb is read '''
    with open(filename,'rb') as f:
        return hashlib.md5(f.read(1024*1024*50)).hexdigest()

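# A chunked variant (sketch only, not wired in) would avoid holding up to 50MB in
# memory at once while producing the same digest for the same leading bytes:
#
#   def get_md5_chunked(filename, limit=1024*1024*50, blocksize=1024*1024):
#       md5=hashlib.md5()
#       read=0
#       with open(filename,'rb') as f:
#           while read<limit:
#               block=f.read(min(blocksize,limit-read))
#               if not block:
#                   break
#               md5.update(block)
#               read+=len(block)
#       return md5.hexdigest()
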
def clean_dirs(dirs):
    ''' remove excluded folder names in place so os.walk does not descend into them '''
    for s in BADDIRS:
        if s in dirs:
            dirs.remove(s)
    return dirs

def clean_syms(files,path):
    ''' return only the entries of files that are not symbolic links '''
    nonsyms=[]
    for f in files:
        if not os.path.islink(os.path.join(path,f)):
            nonsyms.append(f)
    return nonsyms

def find_duplicates(sqlfile):
    ''' group files sharing the same hash; returns a list of (hash, [(file,size,date),...]) '''
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    dbh=conn.cursor()
    db.execute("SELECT hash,count(*) FROM list group by hash HAVING count(*) > 1 ")
    duphash=[]
    for row in db:
        hash=row[0]
        dbh.execute("SELECT file,size,date FROM list WHERE hash = ?",(hash,))
        flist=[]
        for entry in dbh:
            flist.append(entry)
        flist.sort(key=lambda file: file[0])
        duphash.append((hash, flist))
    duphash.sort(key=lambda file: file[1][0])
    return duphash

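# Roughly the same duplicate listing can be produced with a single query
# (illustrative, run against the same database):
#
#   SELECT hash, file, size FROM list
#   WHERE hash IN (SELECT hash FROM list GROUP BY hash HAVING count(*) > 1)
#   ORDER BY hash, file;
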
def searchdb(sqlfile,needle):
    ''' print every stored path that contains any of the given substrings '''
    needle=['%'+i+'%' for i in needle]
    like_query=' OR '.join(['file LIKE ?' for i in needle])
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    db.execute("SELECT file FROM list WHERE "+like_query+" ORDER BY file",needle)
    for row in db:
        print(row[0])

def matchdb(sqlfile,needle,helper):
    ''' print the stored path whose basename is closest to needle (difflib ratio) '''
    import difflib as dl
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    if len(helper)>0:
        # narrow the candidate set with the -s substrings first
        helper=['%'+i+'%' for i in helper]
        like_query=' OR '.join(['file LIKE ?' for i in helper])
        db.execute("SELECT file FROM list WHERE "+like_query,helper)
    else:
        db.execute("SELECT file FROM list")
    ratio=0
    best_match=""
    for row in db:
        s=dl.SequenceMatcher(None, os.path.basename(row[0]), needle)
        s_ratio=s.ratio()
        if ratio < s_ratio:
            ratio=s_ratio
            best_match=row[0]
    print(best_match)

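# For reference, SequenceMatcher.ratio() is a similarity score between 0.0 and 1.0
# (1.0 means identical strings), e.g. (illustrative):
#
#   import difflib
#   difflib.SequenceMatcher(None, 'IMG_1234.jpg', 'IMG_1234').ratio()    # high
#   difflib.SequenceMatcher(None, 'IMG_1234.jpg', 'holiday.png').ratio() # much lower
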
def print_structure(files):
    ''' pretty print the (hash, [(file,size,date),...]) groups from find_duplicates '''
    for hash in files:
        #print(hash[0])
        i=1
        for f in hash[1]:
            print("%(i)d: %(x)d:%(f)s " % {'i':i, 'f':f[0], 'x':f[1]})
            i+=1
    return

def main():
    options=setup_options()

    if not os.path.exists(options.sqlfile):
        createdb(options.sqlfile)
    if len(options.search)>0 and not options.match:
        searchdb(options.sqlfile,options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile,options.match,options.search)
        sys.exit(0)
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile,options)
    if options.add or options.changed:
        print('Adding '+options.startpath+' entries...')
        add_recurse(options)
    if options.duplicate:
        files=find_duplicates(options.sqlfile)
        print_structure(files)

    sys.exit(0)


if __name__ == '__main__':
    main()