Add ordering methods for duplicate-file listing

This commit is contained in:
q
2018-05-26 11:40:09 +03:00
parent 3405f45501
commit 162c10e846

View File

@@ -7,7 +7,7 @@ import sqlite3
import subprocess import subprocess
import hashlib import hashlib
import magic import magic
from argparse import ArgumentParser from argparse import ArgumentParser
import ConfigParser,StringIO,io import ConfigParser,StringIO,io
import datetime import datetime
@@ -37,6 +37,10 @@ def setup_options():
help="Depth of summarization for --du.") help="Depth of summarization for --du.")
parser.add_argument("--dup",action="store_true",dest="duplicate",default=False, parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]") help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
parser.add_argument("--dup-order",action="store",dest="duplicate_order",default='path',
help = "Order duplicates by a method. (length = path str length)",
choices = ('age','length','file','path')
)
parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False, parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.") help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False, parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
@@ -67,6 +71,7 @@ def setup_options():
options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile)) options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile))
return options return options
def add_recurse(options): def add_recurse(options):
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -101,6 +106,7 @@ def add_recurse(options):
sys.stdout.write("\n") sys.stdout.write("\n")
return return
def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False): def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
try: try:
fsize=os.path.getsize(filename) fsize=os.path.getsize(filename)
@@ -119,7 +125,7 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
return return
except UnicodeDecodeError: except UnicodeDecodeError:
mime="NA" mime="NA"
if change: if change:
db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \ db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
WHERE file=?",(ftime,hash,fsize,mime,filename)) WHERE file=?",(ftime,hash,fsize,mime,filename))
@@ -130,6 +136,7 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
sys.stdout.write('\r') sys.stdout.write('\r')
return return
def checkdb(options): def checkdb(options):
needle=options.search needle=options.search
if len(needle)==0: if len(needle)==0:
@@ -177,14 +184,14 @@ def checkdb(options):
print(("%-"+pad+"s (%s %7s)")%(f, print(("%-"+pad+"s (%s %7s)")%(f,
humanize_date(os.path.getmtime(f)), humanize_date(os.path.getmtime(f)),
humanize_size(os.path.getsize(f)))) humanize_size(os.path.getsize(f))))
print("----\nFile check summary:") print("----\nFile check summary:")
print("Database modified: %s"%(humanize_date(os.path.getmtime(options.sqlfile)),)) print("Database modified: %s"%(humanize_date(os.path.getmtime(options.sqlfile)),))
print("Checksum matches : %d"%(OK_count,)) print("Checksum matches : %d"%(OK_count,))
print("Checksum mismatch: %d"%(len(differing),)) print("Checksum mismatch: %d"%(len(differing),))
print("Files missing : %d"%(len(missing),)) print("Files missing : %d"%(len(missing),))
print("Files added : %d"%(len(added),)) print("Files added : %d"%(len(added),))
def clean_dirs(dirs): def clean_dirs(dirs):
for s in dirs[:]: for s in dirs[:]:
@@ -192,6 +199,7 @@ def clean_dirs(dirs):
dirs.remove(s) dirs.remove(s)
return dirs return dirs
def clean_syms(files,path): def clean_syms(files,path):
nonsyms=[] nonsyms=[]
for f in files: for f in files:
@@ -208,8 +216,8 @@ def createdb(options):
size INTEGER, mime TEXT)') size INTEGER, mime TEXT)')
db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\ db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\
object TEXT)') object TEXT)')
conn.commit() conn.commit()
config = ConfigParser.RawConfigParser() config = ConfigParser.RawConfigParser()
config.add_section("General") config.add_section("General")
config.set("General","Relative",str(options.relative)) config.set("General","Relative",str(options.relative))
@@ -217,9 +225,10 @@ def createdb(options):
store=StringIO.StringIO() store=StringIO.StringIO()
config.write(store) config.write(store)
db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),)) db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),))
conn.commit() conn.commit()
return return
def delete_nonexisting(sqlfile,options): def delete_nonexisting(sqlfile,options):
conn=sqlite3.connect(sqlfile) conn=sqlite3.connect(sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -236,10 +245,11 @@ def delete_nonexisting(sqlfile,options):
delete=True delete=True
if delete: if delete:
print('removing.. '+row[0]) print('removing.. '+row[0])
dbdel.execute("DELETE FROM list where file == ?",(row[0],)) dbdel.execute("DELETE FROM list where file == ?",(row[0],))
conn.commit() conn.commit()
return return
def disk_used(options): def disk_used(options):
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -262,17 +272,19 @@ def disk_used(options):
else: else:
sizes[ entries.index(start_path) ]+=row[0] sizes[ entries.index(start_path) ]+=row[0]
for entry in zip(sizes,entries): for entry in zip(sizes,entries):
print("| ".join([ str(entry[0]).ljust(14), print("| ".join([ str(entry[0]).ljust(14),
humanize_size(entry[0]).rjust(8), humanize_size(entry[0]).rjust(8),
entry[1]])) entry[1]]))
def filename_join(path,name,options): def filename_join(path,name,options):
filename=os.path.realpath(os.path.join(path,name)) filename=os.path.realpath(os.path.join(path,name))
if options.relative: if options.relative:
return os.path.relpath(filename, options.sqlpath) return os.path.relpath(filename, options.sqlpath)
return filename return filename
def find_duplicates(sqlfile):
def find_duplicates(sqlfile, order):
conn=sqlite3.connect(sqlfile) conn=sqlite3.connect(sqlfile)
conn.text_factory=str conn.text_factory=str
db=conn.cursor() db=conn.cursor()
@@ -285,7 +297,7 @@ def find_duplicates(sqlfile):
flist=[] flist=[]
for row in dbh: for row in dbh:
flist.append(row) flist.append(row)
flist.sort(key=lambda file: file[0]) sort_by_method(flist, order)
duphash.append((hash, flist)) duphash.append((hash, flist))
duphash.sort(key=lambda file: file[1][0]) duphash.sort(key=lambda file: file[1][0])
return duphash return duphash
@@ -317,6 +329,7 @@ def get_folder_contents(db,path):
files.append(base) files.append(base)
return files return files
def get_md5(filename,fullfile=False): def get_md5(filename,fullfile=False):
''' returns content based hash, only first 50Mb is read, unless user wants the whole file ''' ''' returns content based hash, only first 50Mb is read, unless user wants the whole file '''
fsize=os.path.getsize(filename) fsize=os.path.getsize(filename)
@@ -326,8 +339,8 @@ def get_md5(filename,fullfile=False):
block_size=2**24 block_size=2**24
percents_per_block=100/(float(fsize)/block_size) percents_per_block=100/(float(fsize)/block_size)
md5 = hashlib.md5() md5 = hashlib.md5()
with open(filename,'rb') as f: with open(filename,'rb') as f:
for chunk in iter(lambda: f.read(block_size), b''): for chunk in iter(lambda: f.read(block_size), b''):
sys.stderr.write('\r %s (%02d%%)'%(ANIM[anim_i%anim_len],int(anim_i*percents_per_block))) sys.stderr.write('\r %s (%02d%%)'%(ANIM[anim_i%anim_len],int(anim_i*percents_per_block)))
sys.stderr.flush() sys.stderr.flush()
anim_i+=1 anim_i+=1
@@ -347,7 +360,7 @@ def has_changes(options):
has_changes_deleted(db) has_changes_deleted(db)
if options.hasadditions or options.haschanges: if options.hasadditions or options.haschanges:
has_changes_additions(db,options) has_changes_additions(db,options)
def has_changes_deleted(db,exit=True): def has_changes_deleted(db,exit=True):
db.execute('SELECT file FROM list') db.execute('SELECT file FROM list')
deleted=[] deleted=[]
@@ -360,6 +373,7 @@ def has_changes_deleted(db,exit=True):
deleted.append(row[0]) deleted.append(row[0])
return deleted return deleted
def has_changes_additions(db,options,exit=True): def has_changes_additions(db,options,exit=True):
added=[] added=[]
changed=[] changed=[]
@@ -389,7 +403,7 @@ def has_changes_additions(db,options,exit=True):
sys.exit(1) sys.exit(1)
else: else:
changed.append(filename) changed.append(filename)
return (added,changed) return (added,changed)
#~ def hash_match(db,filename,hash): #~ def hash_match(db,filename,hash):
@@ -402,6 +416,7 @@ def humanize_date(date):
return '' return ''
return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S') return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S')
def humanize_size(size,precision=1): def humanize_size(size,precision=1):
if size==None: if size==None:
return 'nan' return 'nan'
@@ -414,11 +429,13 @@ def humanize_size(size,precision=1):
defPrecision=precision defPrecision=precision
return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex]) return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
def is_listed(db,filename): def is_listed(db,filename):
db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,)) db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,))
count=db.fetchall() count=db.fetchall()
return count[0][0]>0 return count[0][0]>0
def matchdb(sqlfile,needle,helper): def matchdb(sqlfile,needle,helper):
needle=needle.lower() needle=needle.lower()
import difflib as dl import difflib as dl
@@ -441,20 +458,27 @@ def matchdb(sqlfile,needle,helper):
best_match=row[0] best_match=row[0]
print(best_match) print(best_match)
def print_structure(files): def print_duplicates(files):
for hash in files: for hash in files:
#print(hash[0]) #print(hash[0])
i=1 i=1
for f in hash[1]: for f in hash[1]:
print "%(i)d: %(x)d:%(f)s " % {'i':i, 'f':f[0], 'x':f[1]} print("%(i)d|%(s)s|%(d)s|%(f)s " % {
'i':i,
'f':f[0],
'd': humanize_date(f[2]),
's': humanize_size(f[1])
})
i+=1 i+=1
return return
def print_stderr(s): def print_stderr(s):
sys.stderr.write(s) sys.stderr.write(s)
sys.stderr.write("\n") sys.stderr.write("\n")
sys.stderr.flush() sys.stderr.flush()
def searchdb(sqlfile,needle): def searchdb(sqlfile,needle):
needle=['%'+i+'%' for i in needle] needle=['%'+i+'%' for i in needle]
like_query=' OR '.join(['file LIKE ?' for i in needle]) like_query=' OR '.join(['file LIKE ?' for i in needle])
@@ -465,6 +489,18 @@ def searchdb(sqlfile,needle):
for row in db: for row in db:
print(row[0]) print(row[0])
def sort_by_method(flist, order):
    """Sort *flist* in place according to *order*.

    Each entry is a database row tuple whose first element is the file
    path and whose third element is the stored timestamp.

    order -- one of:
        'path'   : full path string
        'file'   : basename of the path
        'age'    : stored timestamp (third column)
        'length' : number of characters in the path
    Any other value leaves the list untouched.
    """
    key_funcs = {
        'path': lambda row: row[0],
        'file': lambda row: os.path.basename(row[0]),
        'age': lambda row: row[2],
        'length': lambda row: len(row[0]),
    }
    key = key_funcs.get(order)
    if key is not None:
        flist.sort(key=key)
def stored_options(options): def stored_options(options):
try: try:
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
@@ -483,6 +519,7 @@ def stored_options(options):
return options return options
def main(): def main():
options=setup_options(); options=setup_options();
@@ -507,15 +544,15 @@ def main():
disk_used(options) disk_used(options)
sys.exit(0) sys.exit(0)
if options.delete: if options.delete:
print('Deleting entries...') print('Deleting entries...')
delete_nonexisting(options.sqlfile,options) delete_nonexisting(options.sqlfile,options)
if options.add or options.changed: if options.add or options.changed:
print('Adding '+options.startpath+' entries...') print('Adding '+options.startpath+' entries...')
add_recurse(options) add_recurse(options)
if options.duplicate: if options.duplicate:
files=find_duplicates(options.sqlfile) files=find_duplicates(options.sqlfile, options.duplicate_order)
print_structure(files) print_duplicates(files)
sys.exit(0) sys.exit(0)
main() main()