Add ordering methods for duplicate-file listing

This commit is contained in:
q
2018-05-26 11:40:09 +03:00
parent 3405f45501
commit 162c10e846

View File

@@ -7,7 +7,7 @@ import sqlite3
import subprocess import subprocess
import hashlib import hashlib
import magic import magic
from argparse import ArgumentParser from argparse import ArgumentParser
import ConfigParser,StringIO,io import ConfigParser,StringIO,io
import datetime import datetime
@@ -37,6 +37,10 @@ def setup_options():
help="Depth of summarization for --du.") help="Depth of summarization for --du.")
parser.add_argument("--dup",action="store_true",dest="duplicate",default=False, parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]") help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
parser.add_argument("--dup-order",action="store",dest="duplicate_order",default='path',
help = "Order duplicates by a method. (length = path str length)",
choices = ('age','length','file','path')
)
parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False, parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.") help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False, parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
@@ -67,6 +71,7 @@ def setup_options():
options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile)) options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile))
return options return options
def add_recurse(options): def add_recurse(options):
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -101,6 +106,7 @@ def add_recurse(options):
sys.stdout.write("\n") sys.stdout.write("\n")
return return
def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False): def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
try: try:
fsize=os.path.getsize(filename) fsize=os.path.getsize(filename)
@@ -119,7 +125,7 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
return return
except UnicodeDecodeError: except UnicodeDecodeError:
mime="NA" mime="NA"
if change: if change:
db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \ db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
WHERE file=?",(ftime,hash,fsize,mime,filename)) WHERE file=?",(ftime,hash,fsize,mime,filename))
@@ -130,6 +136,7 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
sys.stdout.write('\r') sys.stdout.write('\r')
return return
def checkdb(options): def checkdb(options):
needle=options.search needle=options.search
if len(needle)==0: if len(needle)==0:
@@ -177,14 +184,14 @@ def checkdb(options):
print(("%-"+pad+"s (%s %7s)")%(f, print(("%-"+pad+"s (%s %7s)")%(f,
humanize_date(os.path.getmtime(f)), humanize_date(os.path.getmtime(f)),
humanize_size(os.path.getsize(f)))) humanize_size(os.path.getsize(f))))
print("----\nFile check summary:") print("----\nFile check summary:")
print("Database modified: %s"%(humanize_date(os.path.getmtime(options.sqlfile)),)) print("Database modified: %s"%(humanize_date(os.path.getmtime(options.sqlfile)),))
print("Checksum matches : %d"%(OK_count,)) print("Checksum matches : %d"%(OK_count,))
print("Checksum mismatch: %d"%(len(differing),)) print("Checksum mismatch: %d"%(len(differing),))
print("Files missing : %d"%(len(missing),)) print("Files missing : %d"%(len(missing),))
print("Files added : %d"%(len(added),)) print("Files added : %d"%(len(added),))
def clean_dirs(dirs): def clean_dirs(dirs):
for s in dirs[:]: for s in dirs[:]:
@@ -192,6 +199,7 @@ def clean_dirs(dirs):
dirs.remove(s) dirs.remove(s)
return dirs return dirs
def clean_syms(files,path): def clean_syms(files,path):
nonsyms=[] nonsyms=[]
for f in files: for f in files:
@@ -208,8 +216,8 @@ def createdb(options):
size INTEGER, mime TEXT)') size INTEGER, mime TEXT)')
db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\ db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\
object TEXT)') object TEXT)')
conn.commit() conn.commit()
config = ConfigParser.RawConfigParser() config = ConfigParser.RawConfigParser()
config.add_section("General") config.add_section("General")
config.set("General","Relative",str(options.relative)) config.set("General","Relative",str(options.relative))
@@ -217,9 +225,10 @@ def createdb(options):
store=StringIO.StringIO() store=StringIO.StringIO()
config.write(store) config.write(store)
db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),)) db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),))
conn.commit() conn.commit()
return return
def delete_nonexisting(sqlfile,options): def delete_nonexisting(sqlfile,options):
conn=sqlite3.connect(sqlfile) conn=sqlite3.connect(sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -236,10 +245,11 @@ def delete_nonexisting(sqlfile,options):
delete=True delete=True
if delete: if delete:
print('removing.. '+row[0]) print('removing.. '+row[0])
dbdel.execute("DELETE FROM list where file == ?",(row[0],)) dbdel.execute("DELETE FROM list where file == ?",(row[0],))
conn.commit() conn.commit()
return return
def disk_used(options): def disk_used(options):
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
conn.text_factory=str conn.text_factory=str
@@ -262,17 +272,19 @@ def disk_used(options):
else: else:
sizes[ entries.index(start_path) ]+=row[0] sizes[ entries.index(start_path) ]+=row[0]
for entry in zip(sizes,entries): for entry in zip(sizes,entries):
print("| ".join([ str(entry[0]).ljust(14), print("| ".join([ str(entry[0]).ljust(14),
humanize_size(entry[0]).rjust(8), humanize_size(entry[0]).rjust(8),
entry[1]])) entry[1]]))
def filename_join(path,name,options): def filename_join(path,name,options):
filename=os.path.realpath(os.path.join(path,name)) filename=os.path.realpath(os.path.join(path,name))
if options.relative: if options.relative:
return os.path.relpath(filename, options.sqlpath) return os.path.relpath(filename, options.sqlpath)
return filename return filename
def find_duplicates(sqlfile):
def find_duplicates(sqlfile, order):
conn=sqlite3.connect(sqlfile) conn=sqlite3.connect(sqlfile)
conn.text_factory=str conn.text_factory=str
db=conn.cursor() db=conn.cursor()
@@ -285,7 +297,7 @@ def find_duplicates(sqlfile):
flist=[] flist=[]
for row in dbh: for row in dbh:
flist.append(row) flist.append(row)
flist.sort(key=lambda file: file[0]) sort_by_method(flist, order)
duphash.append((hash, flist)) duphash.append((hash, flist))
duphash.sort(key=lambda file: file[1][0]) duphash.sort(key=lambda file: file[1][0])
return duphash return duphash
@@ -317,6 +329,7 @@ def get_folder_contents(db,path):
files.append(base) files.append(base)
return files return files
def get_md5(filename,fullfile=False): def get_md5(filename,fullfile=False):
''' returns content based hash, only first 50Mb is read, unless user wants the whole file ''' ''' returns content based hash, only first 50Mb is read, unless user wants the whole file '''
fsize=os.path.getsize(filename) fsize=os.path.getsize(filename)
@@ -326,8 +339,8 @@ def get_md5(filename,fullfile=False):
block_size=2**24 block_size=2**24
percents_per_block=100/(float(fsize)/block_size) percents_per_block=100/(float(fsize)/block_size)
md5 = hashlib.md5() md5 = hashlib.md5()
with open(filename,'rb') as f: with open(filename,'rb') as f:
for chunk in iter(lambda: f.read(block_size), b''): for chunk in iter(lambda: f.read(block_size), b''):
sys.stderr.write('\r %s (%02d%%)'%(ANIM[anim_i%anim_len],int(anim_i*percents_per_block))) sys.stderr.write('\r %s (%02d%%)'%(ANIM[anim_i%anim_len],int(anim_i*percents_per_block)))
sys.stderr.flush() sys.stderr.flush()
anim_i+=1 anim_i+=1
@@ -347,7 +360,7 @@ def has_changes(options):
has_changes_deleted(db) has_changes_deleted(db)
if options.hasadditions or options.haschanges: if options.hasadditions or options.haschanges:
has_changes_additions(db,options) has_changes_additions(db,options)
def has_changes_deleted(db,exit=True): def has_changes_deleted(db,exit=True):
db.execute('SELECT file FROM list') db.execute('SELECT file FROM list')
deleted=[] deleted=[]
@@ -360,6 +373,7 @@ def has_changes_deleted(db,exit=True):
deleted.append(row[0]) deleted.append(row[0])
return deleted return deleted
def has_changes_additions(db,options,exit=True): def has_changes_additions(db,options,exit=True):
added=[] added=[]
changed=[] changed=[]
@@ -389,7 +403,7 @@ def has_changes_additions(db,options,exit=True):
sys.exit(1) sys.exit(1)
else: else:
changed.append(filename) changed.append(filename)
return (added,changed) return (added,changed)
#~ def hash_match(db,filename,hash): #~ def hash_match(db,filename,hash):
@@ -402,6 +416,7 @@ def humanize_date(date):
return '' return ''
return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S') return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S')
def humanize_size(size,precision=1): def humanize_size(size,precision=1):
if size==None: if size==None:
return 'nan' return 'nan'
@@ -414,11 +429,13 @@ def humanize_size(size,precision=1):
defPrecision=precision defPrecision=precision
return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex]) return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
def is_listed(db,filename): def is_listed(db,filename):
db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,)) db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,))
count=db.fetchall() count=db.fetchall()
return count[0][0]>0 return count[0][0]>0
def matchdb(sqlfile,needle,helper): def matchdb(sqlfile,needle,helper):
needle=needle.lower() needle=needle.lower()
import difflib as dl import difflib as dl
@@ -441,20 +458,27 @@ def matchdb(sqlfile,needle,helper):
best_match=row[0] best_match=row[0]
print(best_match) print(best_match)
def print_structure(files): def print_duplicates(files):
for hash in files: for hash in files:
#print(hash[0]) #print(hash[0])
i=1 i=1
for f in hash[1]: for f in hash[1]:
print "%(i)d: %(x)d:%(f)s " % {'i':i, 'f':f[0], 'x':f[1]} print("%(i)d|%(s)s|%(d)s|%(f)s " % {
'i':i,
'f':f[0],
'd': humanize_date(f[2]),
's': humanize_size(f[1])
})
i+=1 i+=1
return return
def print_stderr(s): def print_stderr(s):
sys.stderr.write(s) sys.stderr.write(s)
sys.stderr.write("\n") sys.stderr.write("\n")
sys.stderr.flush() sys.stderr.flush()
def searchdb(sqlfile,needle): def searchdb(sqlfile,needle):
needle=['%'+i+'%' for i in needle] needle=['%'+i+'%' for i in needle]
like_query=' OR '.join(['file LIKE ?' for i in needle]) like_query=' OR '.join(['file LIKE ?' for i in needle])
@@ -465,6 +489,18 @@ def searchdb(sqlfile,needle):
for row in db: for row in db:
print(row[0]) print(row[0])
def sort_by_method(flist, order):
    """Sort *flist* in place according to *order*.

    Each entry is a database row tuple whose first element is the file
    path and whose third element is the stored timestamp.

    order -- one of:
        'path'   : full path string
        'file'   : basename of the path
        'age'    : stored timestamp (third column)
        'length' : number of characters in the path
    Any other value leaves the list untouched.
    """
    key_funcs = {
        'path': lambda row: row[0],
        'file': lambda row: os.path.basename(row[0]),
        'age': lambda row: row[2],
        'length': lambda row: len(row[0]),
    }
    key = key_funcs.get(order)
    if key is not None:
        flist.sort(key=key)
def stored_options(options): def stored_options(options):
try: try:
conn=sqlite3.connect(options.sqlfile) conn=sqlite3.connect(options.sqlfile)
@@ -483,6 +519,7 @@ def stored_options(options):
return options return options
def main(): def main():
options=setup_options(); options=setup_options();
@@ -507,15 +544,15 @@ def main():
disk_used(options) disk_used(options)
sys.exit(0) sys.exit(0)
if options.delete: if options.delete:
print('Deleting entries...') print('Deleting entries...')
delete_nonexisting(options.sqlfile,options) delete_nonexisting(options.sqlfile,options)
if options.add or options.changed: if options.add or options.changed:
print('Adding '+options.startpath+' entries...') print('Adding '+options.startpath+' entries...')
add_recurse(options) add_recurse(options)
if options.duplicate: if options.duplicate:
files=find_duplicates(options.sqlfile) files=find_duplicates(options.sqlfile, options.duplicate_order)
print_structure(files) print_duplicates(files)
sys.exit(0) sys.exit(0)
main() main()