ordering methods for duplicates

This commit is contained in:
q
2018-05-26 11:40:09 +03:00
parent 3405f45501
commit 162c10e846

View File

@@ -37,6 +37,10 @@ def setup_options():
help="Depth of summarization for --du.") help="Depth of summarization for --du.")
parser.add_argument("--dup",action="store_true",dest="duplicate",default=False, parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]") help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
parser.add_argument("--dup-order",action="store",dest="duplicate_order",default='path',
help = "Order duplicates by a method. (length = path str length)",
choices = ('age','length','file','path')
)
parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False, parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.") help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False, parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
@@ -67,6 +71,7 @@ def setup_options():
options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile)) options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile))
return options return options
def add_recurse(options): def add_recurse(options):
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -101,6 +106,7 @@ def add_recurse(options):
sys.stdout.write("\n") sys.stdout.write("\n")
return return
def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False): def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
try: try:
fsize=os.path.getsize(filename) fsize=os.path.getsize(filename)
@@ -130,6 +136,7 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
sys.stdout.write('\r') sys.stdout.write('\r')
return return
def checkdb(options): def checkdb(options):
needle=options.search needle=options.search
if len(needle)==0: if len(needle)==0:
@@ -192,6 +199,7 @@ def clean_dirs(dirs):
dirs.remove(s) dirs.remove(s)
return dirs return dirs
def clean_syms(files,path): def clean_syms(files,path):
nonsyms=[] nonsyms=[]
for f in files: for f in files:
@@ -220,6 +228,7 @@ def createdb(options):
conn.commit() conn.commit()
return return
def delete_nonexisting(sqlfile,options): def delete_nonexisting(sqlfile,options):
conn=sqlite3.connect(sqlfile) conn=sqlite3.connect(sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -240,6 +249,7 @@ def delete_nonexisting(sqlfile,options):
conn.commit() conn.commit()
return return
def disk_used(options): def disk_used(options):
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -266,13 +276,15 @@ def disk_used(options):
humanize_size(entry[0]).rjust(8), humanize_size(entry[0]).rjust(8),
entry[1]])) entry[1]]))
def filename_join(path,name,options):
    """Join *path* and *name* and resolve the result with os.path.realpath.

    When ``options.relative`` is truthy the resolved path is returned
    relative to ``options.sqlpath``; otherwise the resolved path itself
    is returned.
    """
    resolved = os.path.realpath(os.path.join(path, name))
    return os.path.relpath(resolved, options.sqlpath) if options.relative else resolved
def find_duplicates(sqlfile):
def find_duplicates(sqlfile, order):
conn=sqlite3.connect(sqlfile) conn=sqlite3.connect(sqlfile)
conn.text_factory=str conn.text_factory=str
db=conn.cursor() db=conn.cursor()
@@ -285,7 +297,7 @@ def find_duplicates(sqlfile):
flist=[] flist=[]
for row in dbh: for row in dbh:
flist.append(row) flist.append(row)
flist.sort(key=lambda file: file[0]) sort_by_method(flist, order)
duphash.append((hash, flist)) duphash.append((hash, flist))
duphash.sort(key=lambda file: file[1][0]) duphash.sort(key=lambda file: file[1][0])
return duphash return duphash
@@ -317,6 +329,7 @@ def get_folder_contents(db,path):
files.append(base) files.append(base)
return files return files
def get_md5(filename,fullfile=False): def get_md5(filename,fullfile=False):
''' returns content based hash, only first 50Mb is read, unless user wants the whole file ''' ''' returns content based hash, only first 50Mb is read, unless user wants the whole file '''
fsize=os.path.getsize(filename) fsize=os.path.getsize(filename)
@@ -360,6 +373,7 @@ def has_changes_deleted(db,exit=True):
deleted.append(row[0]) deleted.append(row[0])
return deleted return deleted
def has_changes_additions(db,options,exit=True): def has_changes_additions(db,options,exit=True):
added=[] added=[]
changed=[] changed=[]
@@ -402,6 +416,7 @@ def humanize_date(date):
return '' return ''
return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S') return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S')
def humanize_size(size,precision=1): def humanize_size(size,precision=1):
if size==None: if size==None:
return 'nan' return 'nan'
@@ -414,11 +429,13 @@ def humanize_size(size,precision=1):
defPrecision=precision defPrecision=precision
return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex]) return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
def is_listed(db,filename):
    """Return True when table ``list`` has at least one row whose ``file`` equals *filename*.

    db: a cursor-like object providing ``execute`` and ``fetchone``.
    """
    db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,))
    # COUNT(*) always yields exactly one row holding the number of matches.
    (matches,) = db.fetchone()
    return matches > 0
def matchdb(sqlfile,needle,helper): def matchdb(sqlfile,needle,helper):
needle=needle.lower() needle=needle.lower()
import difflib as dl import difflib as dl
@@ -441,20 +458,27 @@ def matchdb(sqlfile,needle,helper):
best_match=row[0] best_match=row[0]
print(best_match) print(best_match)
def print_duplicates(files):
    """Print each group of duplicates as ``index|size|date|path`` lines.

    files: iterable of groups; ``group[1]`` is the list of rows of one
        group. For every row, ``row[0]`` is printed as the path,
        ``row[1]`` is passed to humanize_size and ``row[2]`` to
        humanize_date.

    The index restarts at 1 for every group.
    """
    for group in files:
        for index, row in enumerate(group[1], 1):
            print("%(i)d|%(s)s|%(d)s|%(f)s " % {
                'i': index,
                'f': row[0],
                'd': humanize_date(row[2]),
                's': humanize_size(row[1]),
            })
def print_stderr(s):
    """Write *s* followed by a newline to standard error, then flush it."""
    stream = sys.stderr
    for chunk in (s, "\n"):
        stream.write(chunk)
    stream.flush()
def searchdb(sqlfile,needle): def searchdb(sqlfile,needle):
needle=['%'+i+'%' for i in needle] needle=['%'+i+'%' for i in needle]
like_query=' OR '.join(['file LIKE ?' for i in needle]) like_query=' OR '.join(['file LIKE ?' for i in needle])
@@ -465,6 +489,18 @@ def searchdb(sqlfile,needle):
for row in db: for row in db:
print(row[0]) print(row[0])
def sort_by_method(flist, order):
    """Sort *flist* in place by the requested ordering method.

    flist: list of rows; ``row[0]`` is the file path and ``row[2]`` is the
        value compared for 'age'.
    order: one of
        'path'   - full path,
        'file'   - base name of the path,
        'age'    - ``row[2]``, rows whose value is None come first,
        'length' - length of the path string.
        Any other value leaves the list untouched.

    Rows that compare equal under the chosen method are ordered by full
    path, so the result does not depend on the order the rows arrived in.
    """
    def age_key(row):
        # None cannot be compared with numbers on Python 3; put such rows
        # first, which is also where Python 2 would have sorted them.
        stamp = row[2]
        if stamp is None:
            return (False, 0, row[0])
        return (True, stamp, row[0])

    key_for = {
        'path': lambda row: row[0],
        'file': lambda row: (os.path.basename(row[0]), row[0]),
        'age': age_key,
        'length': lambda row: (len(row[0]), row[0]),
    }
    key = key_for.get(order)
    if key is not None:
        flist.sort(key=key)
def stored_options(options): def stored_options(options):
try: try:
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
@@ -483,6 +519,7 @@ def stored_options(options):
return options return options
def main(): def main():
options=setup_options(); options=setup_options();
@@ -513,8 +550,8 @@ def main():
print('Adding '+options.startpath+' entries...') print('Adding '+options.startpath+' entries...')
add_recurse(options) add_recurse(options)
if options.duplicate: if options.duplicate:
files=find_duplicates(options.sqlfile) files=find_duplicates(options.sqlfile, options.duplicate_order)
print_structure(files) print_duplicates(files)
sys.exit(0) sys.exit(0)