From a04bb9f01e09e76d9dd79c564afe55064993032a Mon Sep 17 00:00:00 2001 From: Ville Rantanen Date: Fri, 28 Oct 2016 10:34:16 +0300 Subject: [PATCH] filelist with config, and relative path support --- files/file_list.py | 409 +++++++++++++++++++++++++-------------------- 1 file changed, 230 insertions(+), 179 deletions(-) diff --git a/files/file_list.py b/files/file_list.py index e4643f8..d0d6853 100755 --- a/files/file_list.py +++ b/files/file_list.py @@ -7,6 +7,7 @@ import subprocess import hashlib import magic from argparse import ArgumentParser +import ConfigParser,StringIO,io SQLFILE='list_of_files.sqlite' IMGMATCH=re.compile('.*\.jpg$|.*\.jpeg$|.*\.png$',re.I) @@ -32,8 +33,6 @@ def setup_options(): help="Depth of summarization for --du.") parser.add_argument("--dup",action="store_true",dest="duplicate",default=False, help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]") - parser.add_argument("--full",action="store_true",dest="fullfile",default=False, - help="Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]") parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False, help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.") parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False, @@ -50,6 +49,10 @@ def setup_options(): help="Search list based on path pattern") parser.add_argument("-x",action="append",dest="exclude",default=[], help="Exclude folder name from the lists. This option may be issued several times") + parser.add_argument("--full",action="store_true",dest="fullfile",default=False, + help="ONLY FOR NEW DB CREATION. Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]") + parser.add_argument("--relative",action="store_true",dest="relative",default=False, + help="ONLY FOR NEW DB CREATION. Store filenames relative to database file.") parser.add_argument('startpath', action="store",default='.', nargs='?') options=parser.parse_args() @@ -57,104 +60,9 @@ def setup_options(): if options.duplicate: options.add=not options.add options.startpath=unicode(options.startpath, "UTF-8") + options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile)) return options -def createdb(fname): - conn=sqlite3.connect(fname) - db=conn.cursor() - conn.text_factory=str - db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,\ - file TEXT,date INTEGER, hash TEXT,\ - size INTEGER, mime TEXT)') - conn.commit() - return - -def delete_nonexisting(sqlfile,options): - conn=sqlite3.connect(sqlfile) - conn.text_factory=str - db=conn.cursor() - dbdel=conn.cursor() - db.execute('SELECT file FROM list') - for row in db: - if os.path.exists(row[0]): - delete=False - if not options.symlinks: - if os.path.islink(row[0]): - delete=True - else: - delete=True - if delete: - print('removing.. '+row[0]) - dbdel.execute("DELETE FROM list where file == ?",(row[0],)) - conn.commit() - return - -def disk_used(options): - conn=sqlite3.connect(options.sqlfile) - conn.text_factory=str - db=conn.cursor() - db.execute('SELECT size,replace(file,?,"") as path FROM list WHERE file LIKE ?', - (os.path.realpath(options.diskused)+"/", - os.path.realpath(options.diskused)+"%", - )) - entries=[] - sizes=[] - for row in db: - start_path=row[1].split('/') - start_path="/".join(start_path[0:int(options.diskused_depth)]) - if start_path not in entries: - entries.append(start_path) - sizes.append(row[0]) - else: - sizes[ entries.index(start_path) ]+=row[0] - for entry in zip(sizes,entries): - print("| ".join([ str(entry[0]).ljust(14), - humanize_size(entry[0]).rjust(8), - entry[1]])) - -def has_changes(options): - conn=sqlite3.connect(options.sqlfile) - conn.text_factory=str - db=conn.cursor() - if options.haschanges: - options.changed=True - if options.hasdeletions or options.haschanges: - has_changes_deleted(db) - if options.hasadditions or options.haschanges: - has_changes_additions(db,options) - -def has_changes_deleted(db): - db.execute('SELECT file FROM list') - for row in db: - if not os.path.exists(row[0]): - print('True') - sys.exit(1) - return - -def has_changes_additions(db,options): - for path,dirs,files in os.walk(options.startpath,followlinks=options.symlinks): - dirs=clean_dirs(dirs) - db_files=get_folder_contents(db,os.path.realpath(path)+'/') - if not options.symlinks: - files=clean_syms(files,path) - for file in files: - filename=os.path.realpath(os.path.join(path,file)) - if file==options.sqlfile: - continue - #if not is_listed(db,filename): - if file not in db_files: - print('True') - sys.exit(1) - else: - if options.changed: - ftime=os.path.getmtime(filename) - if not ftime_match(db,filename,ftime): - #file content changed - print('True') - sys.exit(1) - - return - def add_recurse(options): conn=sqlite3.connect(options.sqlfile) conn.text_factory=str @@ -163,11 +71,11 @@ def add_recurse(options): dirs=clean_dirs(dirs) dirs.sort() files.sort() - db_files=get_folder_contents(db,os.path.realpath(path)+'/') + db_files=get_folder_contents(db,filename_join(path,"",options)+"/") if not options.symlinks: files=clean_syms(files,path) for file in files: - filename=os.path.realpath(os.path.join(path,file)) + filename=filename_join(path,file,options) if file==options.sqlfile: continue if not os.path.isfile(filename): @@ -211,61 +119,24 @@ def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False): VALUES(?,?,?,?,?)",(filename,ftime,hash,fsize,mime)) return -def is_listed(db,filename): - db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,)) - count=db.fetchall() - return count[0][0]>0 - -def get_folder_contents(db,path): - ''' return the contents of the folder ''' - files=[] - db.execute("SELECT file FROM list where file LIKE ?",(path+'%',)) +def checkdb(sqlfile,fullFile,needle): + if len(needle)==0: + needle.append('%') + needle=['%'+i+'%' for i in needle] + like_query=' OR '.join(['file LIKE ?' for i in needle]) + conn=sqlite3.connect(sqlfile) + conn.text_factory=str + db=conn.cursor() + db.execute("SELECT file,hash FROM list WHERE "+like_query+" ORDER BY file",needle) for row in db: - try: - base=row[0].decode('utf-8').replace(path,'',1) - except UnicodeDecodeError: - print(row[0]+" is giving me trouble.") - try: - base=row[0].encode('utf-8').replace(path,'',1) - except UnicodeDecodeError: - print(row[0]+" is still giving me trouble.") - sys.exit(1) - if base.find('/')==-1: - files.append(base) - return files - -def ftime_match(db,filename,ftime): - db.execute("SELECT date FROM list where file == ?",(filename,)) - count=db.fetchall() - return count[0][0]==ftime - -def hash_match(db,filename,hash): - db.execute("SELECT hash FROM list where file == ?",(filename,)) - count=db.fetchall() - return count[0][0]==hash - -def humanize_size(size,precision=1): - if size==None: - return 'nan' - suffixes=['B','KB','MB','GB','TB'] - suffixIndex = 0 - defPrecision=0 - while size > 1024: - suffixIndex += 1 #increment the index of the suffix - size = size/1024.0 #apply the division - defPrecision=precision - return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex]) - -def get_md5(filename,fullfile=False): - ''' returns content based hash, only first 50Mb is read, unless user wants the whole file ''' - if fullfile: - block_size=2**20 - md5 = hashlib.md5() - with open(filename,'rb') as f: - for chunk in iter(lambda: f.read(block_size), b''): - md5.update(chunk) - return md5.hexdigest() - return hashlib.md5(open(filename,'rb').read(1024*1024*50)).hexdigest() + status='OK' + if os.path.exists(row[0]): + md5f=get_md5(row[0],fullFile) + if row[1]!=md5f: + status='Checksum-difference' + else: + status='Not-found' + print("%s %s"%(row[0],status)) def clean_dirs(dirs): for s in dirs[:]: @@ -280,6 +151,79 @@ def clean_syms(files,path): nonsyms.append(f) return nonsyms +def createdb(options): + conn=sqlite3.connect(options.sqlfile) + db=conn.cursor() + conn.text_factory=str + db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,\ + file TEXT,date INTEGER, hash TEXT,\ + size INTEGER, mime TEXT)') + db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\ + object TEXT)') + conn.commit() + + config = ConfigParser.RawConfigParser() + config.add_section("General") + config.set("General","Relative",str(options.relative)) + config.set("General","FullFile",str(options.fullfile)) + store=StringIO.StringIO() + config.write(store) + db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),)) + conn.commit() + return + +def delete_nonexisting(sqlfile,options): + conn=sqlite3.connect(sqlfile) + conn.text_factory=str + db=conn.cursor() + dbdel=conn.cursor() + db.execute('SELECT file FROM list') + for row in db: + if os.path.exists(row[0]): + delete=False + if not options.symlinks: + if os.path.islink(row[0]): + delete=True + else: + delete=True + if delete: + print('removing.. '+row[0]) + dbdel.execute("DELETE FROM list where file == ?",(row[0],)) + conn.commit() + return + +def disk_used(options): + conn=sqlite3.connect(options.sqlfile) + conn.text_factory=str + db=conn.cursor() + checkpath=filename_join(options.diskused,"",options)+"/" + if checkpath=="./": + checkpath="" + db.execute('SELECT size,replace(file,?,"") as path FROM list WHERE file LIKE ?', + (checkpath, + checkpath+"%", + )) + entries=[] + sizes=[] + for row in db: + start_path=row[1].split('/') + start_path="/".join(start_path[0:int(options.diskused_depth)]) + if start_path not in entries: + entries.append(start_path) + sizes.append(row[0]) + else: + sizes[ entries.index(start_path) ]+=row[0] + for entry in zip(sizes,entries): + print("| ".join([ str(entry[0]).ljust(14), + humanize_size(entry[0]).rjust(8), + entry[1]])) + +def filename_join(path,name,options): + filename=os.path.realpath(os.path.join(path,name)) + if options.relative: + return os.path.relpath(filename, options.sqlpath) + return filename + def find_duplicates(sqlfile): conn=sqlite3.connect(sqlfile) conn.text_factory=str @@ -298,34 +242,110 @@ def find_duplicates(sqlfile): duphash.sort(key=lambda file: file[1][0]) return duphash -def searchdb(sqlfile,needle): - needle=['%'+i+'%' for i in needle] - like_query=' OR '.join(['file LIKE ?' for i in needle]) - conn=sqlite3.connect(sqlfile) - conn.text_factory=str - db=conn.cursor() - db.execute("SELECT file FROM list WHERE "+like_query+" ORDER BY file",needle) - for row in db: - print(row[0]) +def ftime_match(db,filename,ftime): + db.execute("SELECT date FROM list where file == ?",(filename,)) + count=db.fetchall() + return count[0][0]==ftime -def checkdb(sqlfile,fullFile,needle): - if len(needle)==0: - needle.append('%') - needle=['%'+i+'%' for i in needle] - like_query=' OR '.join(['file LIKE ?' for i in needle]) - conn=sqlite3.connect(sqlfile) +def get_folder_contents(db,path): + ''' return the contents of the folder ''' + files=[] + if path=="./": + db.execute("SELECT file FROM list where file NOT LIKE ?",('%/%',)) + path="" + else: + db.execute("SELECT file FROM list where file LIKE ?",(path+'%',)) + for row in db: + try: + base=row[0].decode('utf-8').replace(path,'',1) + except UnicodeDecodeError: + print(row[0]+" is giving me trouble.") + try: + base=row[0].encode('utf-8').replace(path,'',1) + except UnicodeDecodeError: + print(row[0]+" is still giving me trouble.") + sys.exit(1) + if base.find('/')==-1: + files.append(base) + return files + + +def get_md5(filename,fullfile=False): + ''' returns content based hash, only first 50Mb is read, unless user wants the whole file ''' + if fullfile: + block_size=2**20 + md5 = hashlib.md5() + with open(filename,'rb') as f: + for chunk in iter(lambda: f.read(block_size), b''): + md5.update(chunk) + return md5.hexdigest() + return hashlib.md5(open(filename,'rb').read(1024*1024*50)).hexdigest() + + +def has_changes(options): + conn=sqlite3.connect(options.sqlfile) conn.text_factory=str db=conn.cursor() - db.execute("SELECT file,hash FROM list WHERE "+like_query+" ORDER BY file",needle) + if options.haschanges: + options.changed=True + if options.hasdeletions or options.haschanges: + has_changes_deleted(db) + if options.hasadditions or options.haschanges: + has_changes_additions(db,options) + +def has_changes_deleted(db): + db.execute('SELECT file FROM list') for row in db: - status='OK' - if os.path.exists(row[0]): - md5f=get_md5(row[0],fullFile) - if row[1]!=md5f: - status='Checksum-difference' - else: - status='Not-found' - print("%s %s"%(row[0],status)) + if not os.path.exists(row[0]): + print('True') + sys.exit(1) + return + +def has_changes_additions(db,options): + for path,dirs,files in os.walk(options.startpath,followlinks=options.symlinks): + dirs=clean_dirs(dirs) + db_files=get_folder_contents(db,filename_join(path,"",options)+"/") + if not options.symlinks: + files=clean_syms(files,path) + for file in files: + filename=filename_join(path,file,options) + if file==options.sqlfile: + continue + #if not is_listed(db,filename): + if file not in db_files: + print('True') + sys.exit(1) + else: + if options.changed: + ftime=os.path.getmtime(filename) + if not ftime_match(db,filename,ftime): + #file content changed + print('True') + sys.exit(1) + + return + +#~ def hash_match(db,filename,hash): + #~ db.execute("SELECT hash FROM list where file == ?",(filename,)) + #~ count=db.fetchall() + #~ return count[0][0]==hash + +def humanize_size(size,precision=1): + if size==None: + return 'nan' + suffixes=['B','KB','MB','GB','TB'] + suffixIndex = 0 + defPrecision=0 + while size > 1024: + suffixIndex += 1 #increment the index of the suffix + size = size/1024.0 #apply the division + defPrecision=precision + return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex]) + +def is_listed(db,filename): + db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,)) + count=db.fetchall() + return count[0][0]>0 def matchdb(sqlfile,needle,helper): needle=needle.lower() @@ -358,11 +378,42 @@ def print_structure(files): i+=1 return +def searchdb(sqlfile,needle): + needle=['%'+i+'%' for i in needle] + like_query=' OR '.join(['file LIKE ?' for i in needle]) + conn=sqlite3.connect(sqlfile) + conn.text_factory=str + db=conn.cursor() + db.execute("SELECT file FROM list WHERE "+like_query+" ORDER BY file",needle) + for row in db: + print(row[0]) + +def stored_options(options): + try: + conn=sqlite3.connect(options.sqlfile) + db=conn.cursor() + conn.text_factory=str + db.execute("SELECT object FROM config") + store="" + for row in db: + store+=row[0]+'\n' + config = ConfigParser.RawConfigParser() + config.readfp(io.BytesIO(store)) + options.relative=config.getboolean("General","Relative") + options.fullfile=config.getboolean("General","FullFile") + except: + pass + + return options + def main(): options=setup_options(); if not os.path.exists(options.sqlfile): - createdb(options.sqlfile); + createdb(options); + options=stored_options(options) + if options.relative: + os.chdir(options.sqlpath) if options.haschanges or options.hasadditions or options.hasdeletions: has_changes(options) sys.exit(0)