568 lines
20 KiB
Python
Executable File
568 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*- coding: latin-1 -*-
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
from future import standard_library
|
|
standard_library.install_aliases()
|
|
from builtins import zip
|
|
from builtins import str
|
|
from past.utils import old_div
|
|
import sys
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import subprocess
|
|
import hashlib
|
|
import magic
|
|
from argparse import ArgumentParser
|
|
import configparser
|
|
import io
|
|
import datetime
|
|
|
|
# Default database file name, used when -f is not given.
SQLFILE = 'list_of_files.sqlite'
# Case-insensitive matcher for common image extensions. Written as a raw
# string: '\.' inside a plain literal is an invalid escape sequence.
IMGMATCH = re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$', re.I)
# Directory names skipped while walking; extended with the -x values.
BADDIRS = []
MINSIZE = 0
# libmagic handle; add_single() stores its file() result in the mime column.
MIME = magic.open(magic.MAGIC_NONE)
#MIME=magic.open(magic.MAGIC_MIME)
MIME.load()
# Frames of the progress animation shown by get_md5() for large files.
ANIM = ['.', '·', "'", "'", '·', '.', '_']
# Number of bytes hashed per file unless full-file hashing is enabled.
DEFAULT_CHUNK = 1024 * 1024 * 50
|
|
|
|
def setup_options():
    """Parse the command line and return the resulting options namespace.

    Besides parsing, this extends the module-level BADDIRS list with the
    -x values, inverts the 'add' flag when --dup is given, and stores the
    directory that contains the database file in ``options.sqlpath``.
    """
    parser = ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a", action="store_false", dest="add", default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c", action="store_true", dest="changed", default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("--check", action="store_true", dest="check", default=False,
                        help="Check md5sums of files. Limit check with -s.")
    parser.add_argument("-d", action="store_true", dest="delete", default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("--du", type=str, action='store', dest="diskused", default=False,
                        help="Print directory sizes. Argument is the path where directories are listed from.")
    # Parsed as int so a non-numeric depth is rejected at the command line
    # instead of failing later inside disk_used().
    parser.add_argument("--du-depth", type=int, action='store', dest="diskused_depth", default=1,
                        help="Depth of summarization for --du.")
    parser.add_argument("--dup", action="store_true", dest="duplicate", default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--dup-order", action="store", dest="duplicate_order", default='path',
                        help="Order duplicates by a method. (length = path str length)",
                        choices=('age', 'length', 'file', 'path')
                        )
    parser.add_argument("--haschanges", action="store_true", dest="haschanges", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasdeletions", action="store_true", dest="hasdeletions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB lists files that no longer exist. Exit code 0 if all intact.")
    parser.add_argument("--hasadditions", action="store_true", dest="hasadditions", default=False,
                        help="Do not change anything, return True and exit code 1 if there are files not yet in DB (with -c, also changed files). Exit code 0 if all intact.")
    parser.add_argument("-f", action="store", dest="sqlfile", default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-l", action="store_true", dest="symlinks", default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("--match", type=str, dest="match", default=False,
                        help="Search for closest match from basenames, can be helped with adding -s")
    parser.add_argument("-s", type=str, action='append', dest="search", default=[],
                        help="Search list based on path pattern")
    parser.add_argument("-x", action="append", dest="exclude", default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument("--full", action="store_true", dest="fullfile", default=False,
                        help="ONLY FOR NEW DB CREATION. Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]")
    parser.add_argument("--relative", action="store_true", dest="relative", default=False,
                        help="ONLY FOR NEW DB CREATION. Store filenames relative to database file.")
    parser.add_argument('startpath', action="store", default='.', nargs='?')

    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        # --dup flips the meaning of -a: listing duplicates does not add
        # new files unless -a is given as well.
        options.add = not options.add

    options.sqlpath = os.path.dirname(os.path.realpath(options.sqlfile))
    return options
|
|
|
|
|
|
def add_recurse(options):
    """Walk ``options.startpath`` and bring the database up to date.

    Files missing from the database are inserted when ``options.add`` is
    set; files whose modification time differs from the stored one are
    re-hashed when ``options.changed`` is set.  Changes are committed
    after each directory and the connection is closed on the way out.
    """
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    # Resolved once so the database is skipped even when -f was given as a
    # path rather than a bare file name.
    db_realpath = os.path.realpath(options.sqlfile)
    prev_path_len = 0
    try:
        for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
            # Progress line; the padding blanks out leftovers of a longer
            # previous path.
            sys.stdout.write("\r%s%s" % (filename_join(path, ".", options),
                                         (prev_path_len - len(path)) * ' '))
            prev_path_len = len(path)
            # clean_dirs() edits the list in place, which is what stops
            # os.walk from descending into the removed directories.
            dirs = clean_dirs(dirs)
            dirs.sort()
            files.sort()
            db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
            if not options.symlinks:
                files = clean_syms(files, path)
            for file in files:
                filename = filename_join(path, file, options)
                if file == options.sqlfile:
                    continue
                if os.path.realpath(os.path.join(path, file)) == db_realpath:
                    continue
                if not os.path.isfile(filename):
                    continue
                #if not is_listed(db,filename):
                if file not in db_files:
                    if options.add:
                        add_single(conn, filename, change=False, fullfile=options.fullfile)
                elif options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        add_single(conn, filename, change=True, fullfile=options.fullfile)
            conn.commit()
    finally:
        conn.close()
    sys.stdout.write("\n")
    return
|
|
|
|
|
|
def add_single(conn, filename, change=False, hash=None, minsize=0, fullfile=False):
    """Insert or update the database row of a single file.

    conn     -- open sqlite3 connection; committing is left to the caller.
    filename -- path of the file, as stored in the 'file' column.
    change   -- True updates the existing row, False inserts a new one.
    hash     -- precomputed checksum; computed with get_md5() when None.
    minsize  -- not used by this function.
    fullfile -- passed on to get_md5().

    Returns None.  Nothing is written when the file cannot be read.
    """
    try:
        fsize = os.path.getsize(filename)
        hsize = humanize_size(fsize)
    except (IOError, OSError):
        # Keep fsize bound so the statements below never hit an
        # unassigned name.
        fsize = None
        hsize = ""
    print("\r%s (%s)" % (filename, hsize))
    db = conn.cursor()
    try:
        if hash is None:
            hash = get_md5(filename, fullfile)
        ftime = os.path.getmtime(filename)
    except (IOError, OSError):
        print("File '%s' not found. Bad link?" % (filename,))
        return
    try:
        # Pass the path itself: str(bytes) produces the "b'...'" repr,
        # which names a file that does not exist.
        mime = MIME.file(filename)
    except (IOError, OSError):
        print("File '%s' not found. Bad link?" % (filename,))
        return
    except (UnicodeError, TypeError):
        mime = "NA"

    if change:
        db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? "
                   "WHERE file=?", (ftime, hash, fsize, mime, filename))
        #print "changing: %(f)s " % {'f':filename}
    else:
        db.execute("INSERT INTO list(file,date,hash,size,mime) "
                   "VALUES(?,?,?,?,?)", (filename, ftime, hash, fsize, mime))
    sys.stdout.write('\r')
    return
|
|
|
|
|
|
def checkdb(options):
    """Compare stored checksums with the files on disk and print a report.

    The rows to check are limited by ``options.search`` (all rows when it
    is empty).  Each file is reported as OK, Checksum-difference or
    Not-found, followed by listings of differing, missing and newly added
    files and a summary.
    """
    # Built from a copy so options.search itself is left untouched; with
    # no search terms the single pattern matches every row.
    needle = ['%' + i + '%' for i in (options.search or ['%'])]
    like_query = ' OR '.join(['file LIKE ?' for i in needle])
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file,hash,size,date FROM list WHERE " + like_query + " ORDER BY file", needle)
    missing = []
    differing = []
    OK_count = 0
    for row in db:
        status = 'OK'
        sys.stdout.write("\r%s" % (row[0],))
        if os.path.exists(row[0]):
            md5f = get_md5(row[0], options.fullfile)
            if row[1] != md5f:
                status = 'Checksum-difference'
                differing.append(row)
        else:
            status = 'Not-found'
            missing.append(row)
        sys.stdout.write("\r%s %s\n" % (row[0], status))
        if status == 'OK':
            OK_count += 1
    if differing:
        print_stderr("----\nDiffering files:")
        pad = str(max(len(x[0]) for x in differing))
        for f in differing:
            print(("%-" + pad + "s (%s %7s => %s %7s)") % (f[0], humanize_date(f[3]), humanize_size(f[2]),
                                                           humanize_date(os.path.getmtime(f[0])),
                                                           humanize_size(os.path.getsize(f[0]))))
    if missing:
        print("----\nMissing files:")
        pad = str(max(len(x[0]) for x in missing))
        for f in missing:
            print(("%-" + pad + "s (%s %7s)") % (f[0], humanize_date(f[3]), humanize_size(f[2])))
    (added, changed) = has_changes_additions(db, options, False)
    if added:
        print("----\nAdded files:")
        # 'added' holds plain path strings (not rows), so the width comes
        # from the string itself.
        pad = str(max(len(x) for x in added))
        for f in added:
            print(("%-" + pad + "s (%s %7s)") % (f,
                                                 humanize_date(os.path.getmtime(f)),
                                                 humanize_size(os.path.getsize(f))))

    print("----\nFile check summary:")
    print("Database modified: %s" % (humanize_date(os.path.getmtime(options.sqlfile)),))
    print("Checksum matches : %d" % (OK_count,))
    print("Checksum mismatch: %d" % (len(differing),))
    print("Files missing : %d" % (len(missing),))
    print("Files added : %d" % (len(added),))
    conn.close()
|
|
|
|
|
|
def clean_dirs(dirs):
    """Drop excluded and hidden directory names from ``dirs`` in place.

    A name is dropped when it is listed in BADDIRS or starts with a dot.
    The same list object is returned.
    """
    dirs[:] = [name for name in dirs
               if not (name in BADDIRS or name.startswith("."))]
    return dirs
|
|
|
|
|
|
def clean_syms(files, path):
    """Return the names in ``files`` that are not symbolic links.

    Each name is checked as ``path``/name; the input list is not changed.
    """
    return [name for name in files
            if not os.path.islink(os.path.join(path, name))]
|
|
|
|
def createdb(options):
    """Create a new database at ``options.sqlfile``.

    Creates the 'list' and 'config' tables and stores one config row
    holding the Relative and FullFile settings in INI format.  The
    connection is closed before returning.
    """
    conn = sqlite3.connect(options.sqlfile)
    try:
        conn.text_factory = str
        db = conn.cursor()
        db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,'
                   ' file TEXT,date INTEGER, hash TEXT,'
                   ' size INTEGER, mime TEXT)')
        db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,'
                   ' object TEXT)')
        conn.commit()

        # The settings are serialized as text so stored_options() can
        # read them back with configparser.
        config = configparser.RawConfigParser()
        config.add_section("General")
        config.set("General", "Relative", str(options.relative))
        config.set("General", "FullFile", str(options.fullfile))
        store = io.StringIO()
        config.write(store)
        db.execute("INSERT INTO config (object) values (?)", (store.getvalue(),))
        conn.commit()
    finally:
        conn.close()
    return
|
|
|
|
|
|
def delete_nonexisting(sqlfile, options):
    """Remove database rows whose files are gone.

    A row is removed when its path no longer exists, or when the path is
    a symbolic link and ``options.symlinks`` is false.  Each removed path
    is printed.  The change is committed and the connection closed.
    """
    conn = sqlite3.connect(sqlfile)
    try:
        conn.text_factory = str
        db = conn.cursor()
        db.execute('SELECT file FROM list')
        # Read every name first so the table is not modified while a
        # SELECT over it is still being stepped through.
        stored = [row[0] for row in db.fetchall()]
        for name in stored:
            if os.path.exists(name):
                stale = (not options.symlinks) and os.path.islink(name)
            else:
                stale = True
            if stale:
                print('removing.. ' + name)
                db.execute("DELETE FROM list where file == ?", (name,))
        conn.commit()
    finally:
        conn.close()
    return
|
|
|
|
|
|
def disk_used(options):
    """Print the summed size of the stored files under ``options.diskused``.

    Sizes are grouped by the first ``options.diskused_depth`` components
    of each path below the given directory.  One line is printed per
    group, in the order the groups are first seen: size in bytes,
    human-readable size and the group path.
    """
    from collections import OrderedDict

    conn = sqlite3.connect(options.sqlfile)
    try:
        conn.text_factory = str
        db = conn.cursor()
        checkpath = filename_join(options.diskused, "", options) + "/"
        if checkpath == "./":
            # The database directory itself: stored names carry no prefix.
            checkpath = ""
        # '' rather than "": a double-quoted token is an identifier in SQL
        # and is only read as a string by SQLite's legacy fallback.
        db.execute("SELECT size,replace(file,?,'') as path FROM list WHERE file LIKE ?",
                   (checkpath,
                    checkpath + "%",
                    ))
        depth = int(options.diskused_depth)
        totals = OrderedDict()
        for size, rel_path in db:
            start_path = "/".join(rel_path.split('/')[0:depth])
            if start_path in totals:
                totals[start_path] += size
            else:
                totals[start_path] = size
    finally:
        conn.close()
    for start_path, total in totals.items():
        print("| ".join([str(total).ljust(14),
                         humanize_size(total).rjust(8),
                         start_path]))
|
|
|
|
|
|
def filename_join(path, name, options):
    """Join ``path`` and ``name`` into the form used for stored file names.

    The joined path is resolved with os.path.realpath.  When
    ``options.relative`` is set, the result is expressed relative to
    ``options.sqlpath``; otherwise the resolved path is returned.
    """
    resolved = os.path.realpath(os.path.join(path, name))
    if not options.relative:
        return resolved
    return os.path.relpath(resolved, options.sqlpath)
|
|
|
|
|
|
def find_duplicates(sqlfile, order):
    """Return groups of files that share a hash.

    Only hashes that occur more than once among rows with size > 0 are
    considered.  The result is a list of ``(hash, rows)`` tuples, where
    rows are ``(file, size, date)`` tuples ordered by sort_by_method();
    the groups themselves are ordered by their first row.
    """
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    hash_cursor = conn.cursor()
    file_cursor = conn.cursor()
    hash_cursor.execute("SELECT hash,count(*) FROM list WHERE size > 0 GROUP BY hash HAVING count(*) > 1 ")
    groups = []
    for hash_row in hash_cursor:
        digest = hash_row[0]
        file_cursor.execute("SELECT file,size,date FROM list WHERE hash = ?", (digest,))
        members = list(file_cursor)
        sort_by_method(members, order)
        groups.append((digest, members))
    groups.sort(key=lambda group: group[1][0])
    return groups
|
|
|
|
def ftime_match(db, filename, ftime):
    """Tell whether the stored modification time of ``filename`` equals ``ftime``.

    db       -- sqlite3 cursor on a database containing the 'list' table.
    filename -- value to look up in the 'file' column.
    ftime    -- modification time to compare with the stored 'date'.

    Returns False when the file has no row, so an unknown file counts as
    not matching instead of raising IndexError.
    """
    db.execute("SELECT date FROM list where file == ?", (filename,))
    row = db.fetchone()
    if row is None:
        return False
    return row[0] == ftime
|
|
|
|
def get_folder_contents(db, path):
    """Return the base names of the stored files located directly in ``path``.

    db   -- sqlite3 cursor on a database containing the 'list' table.
    path -- folder prefix ending in '/'; "./" selects stored names that
            contain no '/' at all.

    Files in subfolders of ``path`` are not included.
    """
    files = []
    if path == "./":
        db.execute("SELECT file FROM list where file NOT LIKE ?", ('%/%',))
        path = ""
    else:
        db.execute("SELECT file FROM list where file LIKE ?", (path + '%',))
    for row in db:
        stored = row[0]
        try:
            # LIKE is only a coarse filter (wildcard characters in the
            # path, case folding), so require the exact prefix and strip
            # just that prefix.
            if not stored.startswith(path):
                continue
            base = stored[len(path):]
        except UnicodeDecodeError:
            print(stored + " is giving me trouble.")
            sys.exit(1)
        if base.find('/') == -1:
            files.append(base)
    return files
|
|
|
|
|
|
def get_md5(filename, fullfile=False):
    """Return the hex MD5 digest of the file's content.

    Only the first DEFAULT_CHUNK bytes are hashed unless ``fullfile`` is
    true.  With ``fullfile``, files larger than DEFAULT_CHUNK are hashed
    completely in blocks while a progress indicator is written to stderr.
    """
    fsize = os.path.getsize(filename)
    if fullfile and fsize > DEFAULT_CHUNK:
        anim_len = len(ANIM)
        block_size = 2 ** 24
        # Share of the file covered by one block, in percent.
        percents_per_block = 100 / (float(fsize) / block_size)
        md5 = hashlib.md5()
        with open(filename, 'rb') as f:
            for anim_i, chunk in enumerate(iter(lambda: f.read(block_size), b'')):
                sys.stderr.write('\r %s (%02d%%)' % (ANIM[anim_i % anim_len], int(anim_i * percents_per_block)))
                sys.stderr.flush()
                md5.update(chunk)
        sys.stderr.write('\r ')
        return md5.hexdigest()
    # Context manager so the handle is closed right away instead of
    # whenever the garbage collector gets to it.
    with open(filename, 'rb') as f:
        return hashlib.md5(f.read(DEFAULT_CHUNK)).hexdigest()
|
|
|
|
|
|
def has_changes(options):
    """Run the read-only checks selected by the --has* options.

    --haschanges enables both the deletion and the addition check and
    also turns on ``options.changed``.  The helpers are called with their
    default ``exit`` behaviour, so they print 'True' and exit with code 1
    on the first difference they find.
    """
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    everything = options.haschanges
    if everything:
        options.changed = True
    if everything or options.hasdeletions:
        has_changes_deleted(db)
    if everything or options.hasadditions:
        has_changes_additions(db, options)
|
|
|
|
def has_changes_deleted(db, exit=True):
    """Find stored files that no longer exist on disk.

    db   -- sqlite3 cursor on a database containing the 'list' table.
    exit -- when true, print 'True' and exit with code 1 at the first
            missing file; when false, collect the missing paths.

    Returns the list of missing paths (always empty when ``exit`` is true
    and the function returns at all).
    """
    db.execute('SELECT file FROM list')
    deleted = []
    for record in db:
        stored = record[0]
        if os.path.exists(stored):
            continue
        if not exit:
            deleted.append(stored)
            continue
        print('True')
        sys.exit(1)
    return deleted
|
|
|
|
|
|
def has_changes_additions(db, options, exit=True):
    """Find files on disk that are missing from, or newer than, the database.

    db      -- sqlite3 cursor on a database containing the 'list' table.
    options -- parsed options; startpath, symlinks, sqlfile and changed
               are read here.
    exit    -- when true, print 'True' and exit with code 1 at the first
               difference; when false, collect the differences.

    Returns ``(added, changed)``, two lists of file names.  ``changed`` is
    only filled when ``options.changed`` is set.
    """
    added = []
    changed = []
    # Resolved once so the database is skipped even when -f was given as a
    # path rather than a bare file name.
    db_realpath = os.path.realpath(options.sqlfile)
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        # Edits the list in place, which keeps os.walk out of the removed
        # directories.
        dirs = clean_dirs(dirs)
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for file in files:
            filename = filename_join(path, file, options)
            if file == options.sqlfile:
                continue
            if os.path.realpath(os.path.join(path, file)) == db_realpath:
                continue
            # Same filter as add_recurse(): entries it never indexes must
            # not be reported as additions.
            if not os.path.isfile(filename):
                continue
            #if not is_listed(db,filename):
            if file not in db_files:
                if exit:
                    print('True')
                    sys.exit(1)
                added.append(filename)
            elif options.changed:
                ftime = os.path.getmtime(filename)
                if not ftime_match(db, filename, ftime):
                    # file content changed
                    if exit:
                        print('True')
                        sys.exit(1)
                    changed.append(filename)

    return (added, changed)
|
|
|
|
#~ def hash_match(db,filename,hash):
|
|
#~ db.execute("SELECT hash FROM list where file == ?",(filename,))
|
|
#~ count=db.fetchall()
|
|
#~ return count[0][0]==hash
|
|
|
|
def humanize_date(date):
    """Format a timestamp as 'YYYY-MM-DD HH:MM:SS' in local time.

    The value is truncated to whole seconds.  None yields an empty string.
    """
    if date is None:
        return ''
    moment = datetime.datetime.fromtimestamp(int(date))
    return moment.strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
|
|
def humanize_size(size, precision=1):
    """Format a byte count with a B/KB/MB/GB/TB suffix.

    size      -- number of bytes; None yields 'nan'.
    precision -- decimals shown once the value has been scaled; plain
                 byte values are shown without decimals.

    Values beyond the TB range stay in TB instead of running past the end
    of the suffix list.
    """
    if size is None:
        return 'nan'
    suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
    suffixIndex = 0
    defPrecision = 0
    while size > 1024 and suffixIndex < len(suffixes) - 1:
        suffixIndex += 1  # move on to the next suffix
        size = size / 1024.0  # float divisor: true division
        defPrecision = precision
    return "%.*f%s" % (defPrecision, size, suffixes[suffixIndex])
|
|
|
|
|
|
def is_listed(db, filename):
    """Return True when the 'list' table has at least one row for ``filename``.

    db -- sqlite3 cursor on a database containing the 'list' table.
    """
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    matches = db.fetchone()[0]
    return matches > 0
|
|
|
|
|
|
def matchdb(sqlfile, needle, helper):
    """Print the stored path whose base name is most similar to ``needle``.

    sqlfile -- path of the database file.
    needle  -- name to look for; compared case-insensitively.
    helper  -- list of path fragments; when non-empty, only rows whose
               path contains one of them are considered.

    Rows are visited newest first and only a strictly better similarity
    replaces the current best.  An empty line is printed when nothing
    matches.
    """
    import difflib as dl

    needle = needle.lower()
    conn = sqlite3.connect(sqlfile)
    try:
        conn.text_factory = str
        db = conn.cursor()
        if len(helper) > 0:
            helper = ['%' + i + '%' for i in helper]
            like_query = ' OR '.join(['file LIKE ?' for i in helper])
            db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY date DESC", helper)
        else:
            db.execute("SELECT file FROM list ORDER BY date DESC")
        # SequenceMatcher caches its analysis of the second sequence, so
        # the needle is set once and only the first sequence changes.
        matcher = dl.SequenceMatcher(None)
        matcher.set_seq2(needle)
        ratio = 0
        best_match = ""
        for row in db:
            matcher.set_seq1(os.path.basename(row[0]).lower())
            s_ratio = matcher.ratio()
            if ratio < s_ratio:
                ratio = s_ratio
                best_match = row[0]
    finally:
        conn.close()
    print(best_match)
|
|
|
|
def print_duplicates(files):
    """Print the duplicate groups produced by find_duplicates().

    Each file is printed as 'index|size|date|path ', with the index
    restarting at 1 for every group.
    """
    for group in files:
        #print(group[0])
        for index, entry in enumerate(group[1], 1):
            fields = {
                'i': index,
                'f': entry[0],
                'd': humanize_date(entry[2]),
                's': humanize_size(entry[1]),
            }
            print("%(i)d|%(s)s|%(d)s|%(f)s " % fields)
    return
|
|
|
|
|
|
def print_stderr(s):
    """Write ``s`` and a newline to standard error, then flush it."""
    stream = sys.stderr
    for piece in (s, "\n"):
        stream.write(piece)
    stream.flush()
|
|
|
|
|
|
def searchdb(sqlfile, needle):
    """Print every stored path that contains one of the given fragments.

    sqlfile -- path of the database file.
    needle  -- non-empty list of path fragments.

    Paths are printed one per line, sorted by path.
    """
    patterns = []
    clauses = []
    for fragment in needle:
        patterns.append('%' + fragment + '%')
        clauses.append('file LIKE ?')
    like_query = ' OR '.join(clauses)
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY file", patterns)
    for record in db:
        print(record[0])
|
|
|
|
|
|
def sort_by_method(flist, order):
    """Sort ``flist`` in place according to ``order``.

    flist -- list of (file, size, date) rows.
    order -- 'path' (full path), 'file' (base name), 'age' (date) or
             'length' (length of the path string).  Any other value
             leaves the list untouched.
    """
    sort_keys = {
        'path': lambda entry: entry[0],
        'file': lambda entry: os.path.basename(entry[0]),
        'age': lambda entry: entry[2],
        'length': lambda entry: len(entry[0]),
    }
    chosen = sort_keys.get(order)
    if chosen is not None:
        flist.sort(key=chosen)
|
|
|
|
|
|
def stored_options(options):
    """Overlay the settings stored in the database onto ``options``.

    Reads the rows of the 'config' table as INI text and sets
    ``options.relative`` and ``options.fullfile`` from its General
    section.  This is best effort: when the table or a setting cannot be
    read, the remaining values in ``options`` are left as they are.

    Returns the same ``options`` object.
    """
    try:
        conn = sqlite3.connect(options.sqlfile)
        try:
            conn.text_factory = str
            db = conn.cursor()
            db.execute("SELECT object FROM config")
            store = "".join(row[0] + '\n' for row in db)
        finally:
            conn.close()
        config = configparser.RawConfigParser()
        # The stored value is text, so it is parsed as a string; wrapping
        # it in BytesIO raises TypeError for str input.
        config.read_string(store)
        options.relative = config.getboolean("General", "Relative")
        options.fullfile = config.getboolean("General", "FullFile")
    except (sqlite3.Error, configparser.Error, ValueError, TypeError):
        # Missing table, missing section/option or malformed value.
        pass

    return options
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Creates the database when it does not exist, loads the stored
    settings, then runs either one of the exclusive modes (--has*,
    --check, --match, -s, --du) or the maintenance steps (delete, add or
    update, list duplicates).  Always ends by calling sys.exit().
    """
    options = setup_options()

    if not os.path.exists(options.sqlfile):
        createdb(options)
    options = stored_options(options)
    if options.relative:
        os.chdir(options.sqlpath)

    # Exclusive modes: the first one that applies runs, then the program
    # exits.
    exclusive = True
    if options.haschanges or options.hasadditions or options.hasdeletions:
        has_changes(options)
    elif options.check:
        checkdb(options)
    elif options.match:
        matchdb(options.sqlfile, options.match, options.search)
    elif len(options.search) > 0:
        searchdb(options.sqlfile, options.search)
    elif options.diskused:
        disk_used(options)
    else:
        exclusive = False
    if exclusive:
        sys.exit(0)

    # Maintenance steps; several of them may run in one invocation.
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print('Adding ' + options.startpath + ' entries...')
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile, options.duplicate_order)
        print_duplicates(files)

    sys.exit(0)
|
|
|
|
# Run only when executed as a script, so importing this module does not
# parse the command line or touch a database.
if __name__ == "__main__":
    main()
|
|
|