From 3e449e042b30912635105145b8ef6a639ac2eda9 Mon Sep 17 00:00:00 2001
From: ville rantanen
Date: Mon, 19 Jan 2015 15:52:23 +0200
Subject: [PATCH] full file read for file list

---
 file_list.py | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/file_list.py b/file_list.py
index d254e68..c6962cf 100755
--- a/file_list.py
+++ b/file_list.py
@@ -28,24 +28,26 @@ def setup_options():
         help="Print directory sizes. Argument is the path where directories are listed from.")
     parser.add_argument("--du-depth",type=str,action='store',dest="diskused_depth",default=1,
         help="Depth of summarization for --du.")
+    parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
+        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
+    parser.add_argument("--full",action="store_true",dest="fullfile",default=False,
+        help="Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]")
     parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
         help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
     parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
         help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
     parser.add_argument("--hasadditions",action="store_true",dest="hasadditions",default=False,
         help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
-    parser.add_argument("-l",action="store_true",dest="symlinks",default=False,
-        help="Follow symbolic links [%(default)s]")
-    parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
-        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
-    parser.add_argument("-x",action="append",dest="exclude",default=[],
-        help="Exclude folder name from the lists. This option may be issued several times")
     parser.add_argument("-f",action="store",dest="sqlfile",default=SQLFILE,
         help="SQL file name to use [%(default)s]")
-    parser.add_argument("-s",type=str,action='append',dest="search",default=[],
-        help="Search list based on path pattern")
+    parser.add_argument("-l",action="store_true",dest="symlinks",default=False,
+        help="Follow symbolic links [%(default)s]")
     parser.add_argument("--match",type=str,dest="match",default=False,
         help="Search for closest match from basenames, can be helped with adding -s")
+    parser.add_argument("-s",type=str,action='append',dest="search",default=[],
+        help="Search list based on path pattern")
+    parser.add_argument("-x",action="append",dest="exclude",default=[],
+        help="Exclude folder name from the lists. This option may be issued several times")
     parser.add_argument('startpath', action="store",default='.', nargs='?')
 
     options=parser.parse_args()
@@ -169,24 +171,24 @@ def add_recurse(options):
             #if not is_listed(db,filename):
             if file not in db_files:
                 if options.add:
-                    add_single(conn,filename,change=False)
+                    add_single(conn,filename,change=False,fullfile=options.fullfile)
             else:
                 if options.changed:
                     ftime=os.path.getmtime(filename)
                     if not ftime_match(db,filename,ftime):
                         #file content changed
-                        add_single(conn,filename,change=True)
+                        add_single(conn,filename,change=True,fullfile=options.fullfile)
 
     conn.commit()
     return
 
 
-def add_single(conn,filename,change=False,hash=None,minsize=0):
+def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
     print "%(f)s" % {'f':filename}
     db=conn.cursor()
     try:
         if hash==None:
-            hash=get_md5(filename)
+            hash=get_md5(filename,fullfile)
         ftime=os.path.getmtime(filename)
         fsize=os.path.getsize(filename)
         mime=MIME.file(filename.encode('UTF-8'))
@@ -250,8 +252,15 @@ def humanize_size(size,precision=1):
             defPrecision=precision
     return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
 
-def get_md5(filename):
-    ''' returns content based hash, only first 50Mb is read '''
+def get_md5(filename,fullfile=False):
+    ''' returns content based hash, only first 50Mb is read, unless user wants the whole file '''
+    if fullfile:
+        block_size=2**20
+        md5 = hashlib.md5()
+        with open(filename,'rb') as f:
+            for chunk in iter(lambda: f.read(block_size), b''):
+                md5.update(chunk)
+        return md5.hexdigest()
     return hashlib.md5(open(filename,'rb').read(1024*1024*50)).hexdigest()
 
 def clean_dirs(dirs):