ordering methods for duplicates
This commit is contained in:
@@ -7,7 +7,7 @@ import sqlite3
|
|||||||
import subprocess
|
import subprocess
|
||||||
import hashlib
|
import hashlib
|
||||||
import magic
|
import magic
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
import ConfigParser,StringIO,io
|
import ConfigParser,StringIO,io
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
@@ -37,6 +37,10 @@ def setup_options():
|
|||||||
help="Depth of summarization for --du.")
|
help="Depth of summarization for --du.")
|
||||||
parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
|
parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
|
||||||
help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
|
help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
|
||||||
|
parser.add_argument("--dup-order",action="store",dest="duplicate_order",default='path',
|
||||||
|
help = "Order duplicates by a method. (length = path str length)",
|
||||||
|
choices = ('age','length','file','path')
|
||||||
|
)
|
||||||
parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
|
parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
|
||||||
help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
|
help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
|
||||||
parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
|
parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
|
||||||
@@ -67,6 +71,7 @@ def setup_options():
|
|||||||
options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile))
|
options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile))
|
||||||
return options
|
return options
|
||||||
|
|
||||||
|
|
||||||
def add_recurse(options):
|
def add_recurse(options):
|
||||||
conn=sqlite3.connect(options.sqlfile)
|
conn=sqlite3.connect(options.sqlfile)
|
||||||
conn.text_factory=str
|
conn.text_factory=str
|
||||||
@@ -101,6 +106,7 @@ def add_recurse(options):
|
|||||||
sys.stdout.write("\n")
|
sys.stdout.write("\n")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
|
def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
|
||||||
try:
|
try:
|
||||||
fsize=os.path.getsize(filename)
|
fsize=os.path.getsize(filename)
|
||||||
@@ -119,7 +125,7 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
|
|||||||
return
|
return
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
mime="NA"
|
mime="NA"
|
||||||
|
|
||||||
if change:
|
if change:
|
||||||
db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
|
db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
|
||||||
WHERE file=?",(ftime,hash,fsize,mime,filename))
|
WHERE file=?",(ftime,hash,fsize,mime,filename))
|
||||||
@@ -130,6 +136,7 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
|
|||||||
sys.stdout.write('\r')
|
sys.stdout.write('\r')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def checkdb(options):
|
def checkdb(options):
|
||||||
needle=options.search
|
needle=options.search
|
||||||
if len(needle)==0:
|
if len(needle)==0:
|
||||||
@@ -177,14 +184,14 @@ def checkdb(options):
|
|||||||
print(("%-"+pad+"s (%s %7s)")%(f,
|
print(("%-"+pad+"s (%s %7s)")%(f,
|
||||||
humanize_date(os.path.getmtime(f)),
|
humanize_date(os.path.getmtime(f)),
|
||||||
humanize_size(os.path.getsize(f))))
|
humanize_size(os.path.getsize(f))))
|
||||||
|
|
||||||
print("----\nFile check summary:")
|
print("----\nFile check summary:")
|
||||||
print("Database modified: %s"%(humanize_date(os.path.getmtime(options.sqlfile)),))
|
print("Database modified: %s"%(humanize_date(os.path.getmtime(options.sqlfile)),))
|
||||||
print("Checksum matches : %d"%(OK_count,))
|
print("Checksum matches : %d"%(OK_count,))
|
||||||
print("Checksum mismatch: %d"%(len(differing),))
|
print("Checksum mismatch: %d"%(len(differing),))
|
||||||
print("Files missing : %d"%(len(missing),))
|
print("Files missing : %d"%(len(missing),))
|
||||||
print("Files added : %d"%(len(added),))
|
print("Files added : %d"%(len(added),))
|
||||||
|
|
||||||
|
|
||||||
def clean_dirs(dirs):
|
def clean_dirs(dirs):
|
||||||
for s in dirs[:]:
|
for s in dirs[:]:
|
||||||
@@ -192,6 +199,7 @@ def clean_dirs(dirs):
|
|||||||
dirs.remove(s)
|
dirs.remove(s)
|
||||||
return dirs
|
return dirs
|
||||||
|
|
||||||
|
|
||||||
def clean_syms(files,path):
|
def clean_syms(files,path):
|
||||||
nonsyms=[]
|
nonsyms=[]
|
||||||
for f in files:
|
for f in files:
|
||||||
@@ -208,8 +216,8 @@ def createdb(options):
|
|||||||
size INTEGER, mime TEXT)')
|
size INTEGER, mime TEXT)')
|
||||||
db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\
|
db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\
|
||||||
object TEXT)')
|
object TEXT)')
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
config = ConfigParser.RawConfigParser()
|
config = ConfigParser.RawConfigParser()
|
||||||
config.add_section("General")
|
config.add_section("General")
|
||||||
config.set("General","Relative",str(options.relative))
|
config.set("General","Relative",str(options.relative))
|
||||||
@@ -217,9 +225,10 @@ def createdb(options):
|
|||||||
store=StringIO.StringIO()
|
store=StringIO.StringIO()
|
||||||
config.write(store)
|
config.write(store)
|
||||||
db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),))
|
db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def delete_nonexisting(sqlfile,options):
|
def delete_nonexisting(sqlfile,options):
|
||||||
conn=sqlite3.connect(sqlfile)
|
conn=sqlite3.connect(sqlfile)
|
||||||
conn.text_factory=str
|
conn.text_factory=str
|
||||||
@@ -236,10 +245,11 @@ def delete_nonexisting(sqlfile,options):
|
|||||||
delete=True
|
delete=True
|
||||||
if delete:
|
if delete:
|
||||||
print('removing.. '+row[0])
|
print('removing.. '+row[0])
|
||||||
dbdel.execute("DELETE FROM list where file == ?",(row[0],))
|
dbdel.execute("DELETE FROM list where file == ?",(row[0],))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def disk_used(options):
|
def disk_used(options):
|
||||||
conn=sqlite3.connect(options.sqlfile)
|
conn=sqlite3.connect(options.sqlfile)
|
||||||
conn.text_factory=str
|
conn.text_factory=str
|
||||||
@@ -262,17 +272,19 @@ def disk_used(options):
|
|||||||
else:
|
else:
|
||||||
sizes[ entries.index(start_path) ]+=row[0]
|
sizes[ entries.index(start_path) ]+=row[0]
|
||||||
for entry in zip(sizes,entries):
|
for entry in zip(sizes,entries):
|
||||||
print("| ".join([ str(entry[0]).ljust(14),
|
print("| ".join([ str(entry[0]).ljust(14),
|
||||||
humanize_size(entry[0]).rjust(8),
|
humanize_size(entry[0]).rjust(8),
|
||||||
entry[1]]))
|
entry[1]]))
|
||||||
|
|
||||||
|
|
||||||
def filename_join(path,name,options):
    ''' Build the file name used for database entries.

    path and name are joined and resolved with os.path.realpath. When
    options.relative is set, the resolved name is returned relative to
    options.sqlpath; otherwise the resolved name is returned unchanged.
    '''
    resolved = os.path.realpath(os.path.join(path, name))
    return os.path.relpath(resolved, options.sqlpath) if options.relative else resolved
|
||||||
|
|
||||||
def find_duplicates(sqlfile):
|
|
||||||
|
def find_duplicates(sqlfile, order):
|
||||||
conn=sqlite3.connect(sqlfile)
|
conn=sqlite3.connect(sqlfile)
|
||||||
conn.text_factory=str
|
conn.text_factory=str
|
||||||
db=conn.cursor()
|
db=conn.cursor()
|
||||||
@@ -285,7 +297,7 @@ def find_duplicates(sqlfile):
|
|||||||
flist=[]
|
flist=[]
|
||||||
for row in dbh:
|
for row in dbh:
|
||||||
flist.append(row)
|
flist.append(row)
|
||||||
flist.sort(key=lambda file: file[0])
|
sort_by_method(flist, order)
|
||||||
duphash.append((hash, flist))
|
duphash.append((hash, flist))
|
||||||
duphash.sort(key=lambda file: file[1][0])
|
duphash.sort(key=lambda file: file[1][0])
|
||||||
return duphash
|
return duphash
|
||||||
@@ -317,6 +329,7 @@ def get_folder_contents(db,path):
|
|||||||
files.append(base)
|
files.append(base)
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
def get_md5(filename,fullfile=False):
|
def get_md5(filename,fullfile=False):
|
||||||
''' returns content based hash, only first 50Mb is read, unless user wants the whole file '''
|
''' returns content based hash, only first 50Mb is read, unless user wants the whole file '''
|
||||||
fsize=os.path.getsize(filename)
|
fsize=os.path.getsize(filename)
|
||||||
@@ -326,8 +339,8 @@ def get_md5(filename,fullfile=False):
|
|||||||
block_size=2**24
|
block_size=2**24
|
||||||
percents_per_block=100/(float(fsize)/block_size)
|
percents_per_block=100/(float(fsize)/block_size)
|
||||||
md5 = hashlib.md5()
|
md5 = hashlib.md5()
|
||||||
with open(filename,'rb') as f:
|
with open(filename,'rb') as f:
|
||||||
for chunk in iter(lambda: f.read(block_size), b''):
|
for chunk in iter(lambda: f.read(block_size), b''):
|
||||||
sys.stderr.write('\r %s (%02d%%)'%(ANIM[anim_i%anim_len],int(anim_i*percents_per_block)))
|
sys.stderr.write('\r %s (%02d%%)'%(ANIM[anim_i%anim_len],int(anim_i*percents_per_block)))
|
||||||
sys.stderr.flush()
|
sys.stderr.flush()
|
||||||
anim_i+=1
|
anim_i+=1
|
||||||
@@ -347,7 +360,7 @@ def has_changes(options):
|
|||||||
has_changes_deleted(db)
|
has_changes_deleted(db)
|
||||||
if options.hasadditions or options.haschanges:
|
if options.hasadditions or options.haschanges:
|
||||||
has_changes_additions(db,options)
|
has_changes_additions(db,options)
|
||||||
|
|
||||||
def has_changes_deleted(db,exit=True):
|
def has_changes_deleted(db,exit=True):
|
||||||
db.execute('SELECT file FROM list')
|
db.execute('SELECT file FROM list')
|
||||||
deleted=[]
|
deleted=[]
|
||||||
@@ -360,6 +373,7 @@ def has_changes_deleted(db,exit=True):
|
|||||||
deleted.append(row[0])
|
deleted.append(row[0])
|
||||||
return deleted
|
return deleted
|
||||||
|
|
||||||
|
|
||||||
def has_changes_additions(db,options,exit=True):
|
def has_changes_additions(db,options,exit=True):
|
||||||
added=[]
|
added=[]
|
||||||
changed=[]
|
changed=[]
|
||||||
@@ -389,7 +403,7 @@ def has_changes_additions(db,options,exit=True):
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
changed.append(filename)
|
changed.append(filename)
|
||||||
|
|
||||||
return (added,changed)
|
return (added,changed)
|
||||||
|
|
||||||
#~ def hash_match(db,filename,hash):
|
#~ def hash_match(db,filename,hash):
|
||||||
@@ -402,6 +416,7 @@ def humanize_date(date):
|
|||||||
return ''
|
return ''
|
||||||
return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S')
|
return datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
|
||||||
|
|
||||||
def humanize_size(size,precision=1):
|
def humanize_size(size,precision=1):
|
||||||
if size==None:
|
if size==None:
|
||||||
return 'nan'
|
return 'nan'
|
||||||
@@ -414,11 +429,13 @@ def humanize_size(size,precision=1):
|
|||||||
defPrecision=precision
|
defPrecision=precision
|
||||||
return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
|
return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
|
||||||
|
|
||||||
|
|
||||||
def is_listed(db,filename):
    ''' Return True when the list table holds at least one row for filename.

    db is a cursor-like object: the query is run with db.execute and the
    result is read back with db.fetchall.
    '''
    query = "SELECT COUNT(*) FROM list where file == ?"
    db.execute(query, (filename,))
    rows = db.fetchall()
    # COUNT(*) yields a single row holding a single column.
    hits = rows[0][0]
    return hits > 0
|
||||||
|
|
||||||
|
|
||||||
def matchdb(sqlfile,needle,helper):
|
def matchdb(sqlfile,needle,helper):
|
||||||
needle=needle.lower()
|
needle=needle.lower()
|
||||||
import difflib as dl
|
import difflib as dl
|
||||||
@@ -441,20 +458,27 @@ def matchdb(sqlfile,needle,helper):
|
|||||||
best_match=row[0]
|
best_match=row[0]
|
||||||
print(best_match)
|
print(best_match)
|
||||||
|
|
||||||
def print_duplicates(files):
    ''' Print every group of duplicates, one numbered line per file.

    files is a sequence of groups; group[1] is the list of rows belonging
    to that group. Each row is read as row[0] = file name, row[1] = size
    (passed to humanize_size) and row[2] = date (passed to humanize_date).
    Numbering restarts at 1 for every group. Output line format:
    "<index>|<size>|<date>|<file> ".
    '''
    for group in files:
        # enumerate() replaces the hand-maintained counter.
        for index, row in enumerate(group[1], 1):
            print("%(i)d|%(s)s|%(d)s|%(f)s " % {
                'i': index,
                'f': row[0],
                'd': humanize_date(row[2]),
                's': humanize_size(row[1])
            })
    return
|
||||||
|
|
||||||
|
|
||||||
def print_stderr(s):
    ''' Write s and a trailing newline to standard error, then flush it. '''
    stream = sys.stderr
    for piece in (s, "\n"):
        stream.write(piece)
    stream.flush()
|
||||||
|
|
||||||
|
|
||||||
def searchdb(sqlfile,needle):
|
def searchdb(sqlfile,needle):
|
||||||
needle=['%'+i+'%' for i in needle]
|
needle=['%'+i+'%' for i in needle]
|
||||||
like_query=' OR '.join(['file LIKE ?' for i in needle])
|
like_query=' OR '.join(['file LIKE ?' for i in needle])
|
||||||
@@ -465,6 +489,18 @@ def searchdb(sqlfile,needle):
|
|||||||
for row in db:
|
for row in db:
|
||||||
print(row[0])
|
print(row[0])
|
||||||
|
|
||||||
|
|
||||||
|
def sort_by_method(flist, order):
    ''' Sort flist in place according to the chosen ordering method.

    Each row is indexed as row[0] = file name and row[2] = the value used
    for 'age' ordering. Supported values of order:
      'path'   - full file name (row[0])
      'file'   - base name of row[0]
      'age'    - row[2]
      'length' - string length of row[0]

    Returns None, like list.sort(). Raises ValueError for any other order
    instead of silently leaving the list unsorted.
    '''
    # The methods are mutually exclusive, so pick one key function from a table.
    sort_keys = {
        'path': lambda row: row[0],
        'file': lambda row: os.path.basename(row[0]),
        'age': lambda row: row[2],
        'length': lambda row: len(row[0]),
    }
    try:
        key = sort_keys[order]
    except KeyError:
        raise ValueError("Unknown duplicate ordering method: %r" % (order,))
    flist.sort(key=key)
|
||||||
|
|
||||||
|
|
||||||
def stored_options(options):
|
def stored_options(options):
|
||||||
try:
|
try:
|
||||||
conn=sqlite3.connect(options.sqlfile)
|
conn=sqlite3.connect(options.sqlfile)
|
||||||
@@ -483,6 +519,7 @@ def stored_options(options):
|
|||||||
|
|
||||||
return options
|
return options
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
options=setup_options();
|
options=setup_options();
|
||||||
|
|
||||||
@@ -507,15 +544,15 @@ def main():
|
|||||||
disk_used(options)
|
disk_used(options)
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
if options.delete:
|
if options.delete:
|
||||||
print('Deleting entries...')
|
print('Deleting entries...')
|
||||||
delete_nonexisting(options.sqlfile,options)
|
delete_nonexisting(options.sqlfile,options)
|
||||||
if options.add or options.changed:
|
if options.add or options.changed:
|
||||||
print('Adding '+options.startpath+' entries...')
|
print('Adding '+options.startpath+' entries...')
|
||||||
add_recurse(options)
|
add_recurse(options)
|
||||||
if options.duplicate:
|
if options.duplicate:
|
||||||
files=find_duplicates(options.sqlfile)
|
files=find_duplicates(options.sqlfile, options.duplicate_order)
|
||||||
print_structure(files)
|
print_duplicates(files)
|
||||||
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user