From 3e449e042b30912635105145b8ef6a639ac2eda9 Mon Sep 17 00:00:00 2001
From: ville rantanen
Date: Mon, 19 Jan 2015 15:52:23 +0200
Subject: [PATCH] full file read for file list

---
 file_list.py | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/file_list.py b/file_list.py
index d254e68..c6962cf 100755
--- a/file_list.py
+++ b/file_list.py
@@ -28,24 +28,26 @@ def setup_options():
         help="Print directory sizes. Argument is the path where directories are listed from.")
     parser.add_argument("--du-depth",type=str,action='store',dest="diskused_depth",default=1,
         help="Depth of summarization for --du.")
+    parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
+        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
+    parser.add_argument("--full",action="store_true",dest="fullfile",default=False,
+        help="Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]")
     parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
         help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
     parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
         help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
     parser.add_argument("--hasadditions",action="store_true",dest="hasadditions",default=False,
         help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
-    parser.add_argument("-l",action="store_true",dest="symlinks",default=False,
-        help="Follow symbolic links [%(default)s]")
-    parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
-        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
-    parser.add_argument("-x",action="append",dest="exclude",default=[],
-        help="Exclude folder name from the lists. This option may be issued several times")
     parser.add_argument("-f",action="store",dest="sqlfile",default=SQLFILE,
         help="SQL file name to use [%(default)s]")
-    parser.add_argument("-s",type=str,action='append',dest="search",default=[],
-        help="Search list based on path pattern")
+    parser.add_argument("-l",action="store_true",dest="symlinks",default=False,
+        help="Follow symbolic links [%(default)s]")
     parser.add_argument("--match",type=str,dest="match",default=False,
         help="Search for closest match from basenames, can be helped with adding -s")
+    parser.add_argument("-s",type=str,action='append',dest="search",default=[],
+        help="Search list based on path pattern")
+    parser.add_argument("-x",action="append",dest="exclude",default=[],
+        help="Exclude folder name from the lists. This option may be issued several times")
     parser.add_argument('startpath', action="store",default='.', nargs='?')
 
     options=parser.parse_args()
@@ -169,24 +171,24 @@ def add_recurse(options):
             #if not is_listed(db,filename):
             if file not in db_files:
                 if options.add:
-                    add_single(conn,filename,change=False)
+                    add_single(conn,filename,change=False,fullfile=options.fullfile)
             else:
                 if options.changed:
                     ftime=os.path.getmtime(filename)
                     if not ftime_match(db,filename,ftime):
                         #file content changed
-                        add_single(conn,filename,change=True)
+                        add_single(conn,filename,change=True,fullfile=options.fullfile)
 
     conn.commit()
     return
 
 
-def add_single(conn,filename,change=False,hash=None,minsize=0):
+def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
     print "%(f)s" % {'f':filename}
     db=conn.cursor()
     try:
         if hash==None:
-            hash=get_md5(filename)
+            hash=get_md5(filename,fullfile)
         ftime=os.path.getmtime(filename)
         fsize=os.path.getsize(filename)
         mime=MIME.file(filename.encode('UTF-8'))
@@ -250,8 +252,15 @@ def humanize_size(size,precision=1):
             defPrecision=precision
     return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
 
-def get_md5(filename):
-    ''' returns content based hash, only first 50Mb is read '''
+def get_md5(filename,fullfile=False):
+    ''' returns content based hash, only first 50Mb is read, unless user wants the whole file '''
+    if fullfile:
+        block_size=2**20
+        md5 = hashlib.md5()
+        with open(filename,'rb') as f:
+            for chunk in iter(lambda: f.read(block_size), b''):
+                md5.update(chunk)
+        return md5.hexdigest()
     return hashlib.md5(open(filename,'rb').read(1024*1024*50)).hexdigest()
 
 def clean_dirs(dirs):