Files
q-tools/files/file_list.py
2017-01-07 18:07:26 +02:00

523 lines
19 KiB
Python
Executable File

#!/usr/bin/python
# -*- coding: latin-1 -*-
import sys
import os
import re
import sqlite3
import subprocess
import hashlib
import magic
from argparse import ArgumentParser
import ConfigParser,StringIO,io
import datetime
# Default database filename, created in the current directory unless -f is given.
SQLFILE='list_of_files.sqlite'
# Case-insensitive matcher for common image extensions.
# NOTE(review): appears unused in this file -- possibly kept for history/callers.
IMGMATCH=re.compile('.*\.jpg$|.*\.jpeg$|.*\.png$',re.I)
# Directory names to skip while walking; extended from the -x option at startup.
BADDIRS=[]
# NOTE(review): looks unused here; add_single() has its own minsize parameter.
MINSIZE=0
# libmagic handle used to classify files; MAGIC_NONE yields a textual description.
MIME=magic.open(magic.MAGIC_NONE)
#MIME=magic.open(magic.MAGIC_MIME)
MIME.load()
# Frames for the tiny progress spinner shown while hashing large files.
ANIM=['.','·',"'","'",'·','.','_']
# Only this many leading bytes are hashed unless --full was used at DB creation.
DEFAULT_CHUNK=1024*1024*50
def setup_options():
    """Parse command-line arguments and return the argparse namespace.

    Also applies the side effects the rest of the program relies on:
    extends BADDIRS with -x exclusions, flips the add flag for --dup,
    decodes startpath, and records sqlpath (directory of the DB file).
    """
    parser=ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a",action="store_false",dest="add",default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c",action="store_true",dest="changed",default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("--check",action="store_true",dest="check",default=False,
                        help="Check md5sums of files. Limit check with -s.")
    parser.add_argument("-d",action="store_true",dest="delete",default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("--du",type=str,action='store',dest="diskused",default=False,
                        help="Print directory sizes. Argument is the path where directories are listed from.")
    parser.add_argument("--du-depth",type=str,action='store',dest="diskused_depth",default=1,
                        help="Depth of summarization for --du.")
    parser.add_argument("--dup",action="store_true",dest="duplicate",default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--haschanges",action="store_true",dest="haschanges",default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasdeletions",action="store_true",dest="hasdeletions",default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasadditions",action="store_true",dest="hasadditions",default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("-f",action="store",dest="sqlfile",default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-l",action="store_true",dest="symlinks",default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("--match",type=str,dest="match",default=False,
                        help="Search for closest match from basenames, can be helped with adding -s")
    parser.add_argument("-s",type=str,action='append',dest="search",default=[],
                        help="Search list based on path pattern")
    parser.add_argument("-x",action="append",dest="exclude",default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument("--full",action="store_true",dest="fullfile",default=False,
                        help="ONLY FOR NEW DB CREATION. Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]")
    parser.add_argument("--relative",action="store_true",dest="relative",default=False,
                        help="ONLY FOR NEW DB CREATION. Store filenames relative to database file.")
    parser.add_argument('startpath', action="store",default='.', nargs='?')
    options=parser.parse_args()
    # Feed the -x exclusions into the module-level skip list used by clean_dirs().
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        # --dup implies "do not add" unless -a already flipped it back.
        options.add=not options.add
    # Python 2: decode the byte-string argv entry to unicode for path handling.
    options.startpath=unicode(options.startpath, "UTF-8")
    # Directory containing the DB file; base for --relative path storage.
    options.sqlpath=os.path.dirname(os.path.realpath(options.sqlfile))
    return options
def add_recurse(options):
    """Walk options.startpath, inserting new files and refreshing changed ones.

    Progress (the current directory) is rewritten on a single stdout line.
    New files are added only when options.add is set; existing files are
    re-hashed only when options.changed is set and their mtime differs.
    """
    conn=sqlite3.connect(options.sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    prev_path_len=0
    for path,dirs,files in os.walk(options.startpath,followlinks=options.symlinks):
        # Overwrite the previous progress line, padding with spaces when
        # the new path is shorter than the last one printed.
        sys.stdout.write(("\r%s%s"%(filename_join(path,".",options),(prev_path_len-len(path))*' ')).encode('utf-8'))
        prev_path_len=len(path)
        # clean_dirs prunes in place so os.walk skips excluded/hidden dirs.
        dirs=clean_dirs(dirs)
        dirs.sort()
        files.sort()
        # All filenames already recorded for this directory, for quick lookup.
        db_files=get_folder_contents(db,filename_join(path,"",options)+"/")
        if not options.symlinks:
            files=clean_syms(files,path)
        for file in files:
            filename=filename_join(path,file,options)
            if file==options.sqlfile:
                # Never index the database file itself.
                continue
            if not os.path.isfile(filename):
                continue
            #if not is_listed(db,filename):
            if file not in db_files:
                if options.add:
                    add_single(conn,filename,change=False,fullfile=options.fullfile)
            else:
                if options.changed:
                    ftime=os.path.getmtime(filename)
                    if not ftime_match(db,filename,ftime):
                        #file content changed
                        add_single(conn,filename,change=True,fullfile=options.fullfile)
        # NOTE(review): commit placement reconstructed as per-directory (keeps
        # work from an interrupted run) -- confirm against original intent.
        conn.commit()
    sys.stdout.write("\n")
    return
def add_single(conn,filename,change=False,hash=None,minsize=0,fullfile=False):
    """Insert one file's row in the list table, or update it when change=True.

    conn     -- open sqlite3 connection; the caller is responsible for commit()
    filename -- path exactly as stored in the DB
    hash     -- precomputed md5 hex digest, or None to compute it here
    minsize  -- unused; kept for interface compatibility
    fullfile -- forwarded to get_md5(): hash the whole file, not just 50MB
    """
    # Fix: fsize must exist even when getsize fails, since it is written to
    # the DB below (the original left it unbound and crashed with NameError).
    fsize=None
    hsize=""
    try:
        fsize=os.path.getsize(filename)
        hsize=humanize_size(fsize)
    except (IOError, OSError):
        pass
    print("\r%s (%s)"%(filename,hsize))
    db=conn.cursor()
    try:
        if hash is None:
            hash=get_md5(filename,fullfile)
        ftime=os.path.getmtime(filename)
    except IOError:
        print("File '%s' not found. Bad link?"%(filename,))
        return
    # The magic lookup is handled separately so that a UnicodeDecodeError
    # there cannot leave hash/ftime unbound (as in the original single try).
    try:
        mime=MIME.file(filename.encode('UTF-8'))
    except IOError:
        print("File '%s' not found. Bad link?"%(filename,))
        return
    except UnicodeDecodeError:
        mime="NA"
    if change:
        db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
                    WHERE file=?",(ftime,hash,fsize,mime,filename))
        #print "changing: %(f)s " % {'f':filename}
    else:
        db.execute("INSERT INTO list(file,date,hash,size,mime)\
                    VALUES(?,?,?,?,?)",(filename,ftime,hash,fsize,mime))
    sys.stdout.write('\r')
    return
def checkdb(options):
    """Verify stored checksums against the files on disk and print a report.

    Optional -s patterns restrict which rows are checked.  Per-file status
    goes to stdout; the differing/missing/added summaries go to stderr.
    """
    needle=options.search
    if len(needle)==0:
        # No -s given: match every row.
        needle.append('%')
    needle=['%'+i+'%' for i in needle]
    like_query=' OR '.join(['file LIKE ?' for i in needle])
    conn=sqlite3.connect(options.sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    db.execute("SELECT file,hash,size,date FROM list WHERE "+like_query+" ORDER BY file",needle)
    missing=[]
    differing=[]
    OK_count=0
    for row in db:
        status='OK'
        sys.stdout.write("\r%s"%(row[0],))
        if os.path.exists(row[0]):
            md5f=get_md5(row[0],options.fullfile)
            if row[1]!=md5f:
                status='Checksum-difference'
                differing.append(row)
        else:
            status='Not-found'
            missing.append(row)
        sys.stdout.write("\r%s %s\n"%(row[0],status))
        if status=='OK':
            OK_count+=1
    if len(differing)>0:
        print_stderr("----\nDiffering files:")
        # Column width padding from the longest filename.
        pad=str(max([len(x[0]) for x in differing]))
        for f in differing:
            print_stderr(("%-"+pad+"s (%s %7s => %s %7s)")%(f[0],humanize_date(f[3]),humanize_size(f[2]),
                                                            humanize_date(os.path.getmtime(f[0])),
                                                            humanize_size(os.path.getsize(f[0]))))
    if len(missing)>0:
        print_stderr("----\nMissing files:")
        pad=str(max([len(x[0]) for x in missing]))
        for f in missing:
            print_stderr(("%-"+pad+"s (%s %7s)")%(f[0],humanize_date(f[3]),humanize_size(f[2])))
    # Re-walk the tree to find files present on disk but absent from the DB.
    (added,changed)=has_changes_additions(db,options,False)
    if len(added)>0:
        print_stderr("----\nAdded files:")
        pad=str(max([len(x[0]) for x in added]))
        for f in added:
            print_stderr(("%-"+pad+"s (%s %7s)")%(f,
                                                  humanize_date(os.path.getmtime(f)),
                                                  humanize_size(os.path.getsize(f))))
    print_stderr("----\nFile check summary:")
    print_stderr("Database modified: %s"%(humanize_date(os.path.getmtime(options.sqlfile)),))
    print_stderr("Checksum matches : %d"%(OK_count,))
    print_stderr("Checksum mismatch: %d"%(len(differing),))
    print_stderr("Files missing    : %d"%(len(missing),))
    print_stderr("Files added      : %d"%(len(added),))
def clean_dirs(dirs):
    """Prune excluded (BADDIRS) and hidden ('.'-prefixed) names from *dirs*.

    Mutates the list in place -- required so os.walk actually skips the
    pruned directories -- and returns the same list for convenience.
    """
    kept = [name for name in dirs
            if name not in BADDIRS and not name.startswith(".")]
    dirs[:] = kept
    return dirs
def clean_syms(files,path):
    """Return only those names in *files* that are not symlinks under *path*."""
    return [entry for entry in files
            if not os.path.islink(os.path.join(path, entry))]
def createdb(options):
    """Create a fresh database: the list table plus a config table.

    The config table stores an INI blob recording the creation-time
    --relative and --full settings, which stored_options() reads back
    on every later run.
    """
    conn=sqlite3.connect(options.sqlfile)
    db=conn.cursor()
    conn.text_factory=str
    db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,\
                file TEXT,date INTEGER, hash TEXT,\
                size INTEGER, mime TEXT)')
    db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,\
                object TEXT)')
    conn.commit()
    # Serialize the creation-time options as an INI document (Python 2 API).
    config = ConfigParser.RawConfigParser()
    config.add_section("General")
    config.set("General","Relative",str(options.relative))
    config.set("General","FullFile",str(options.fullfile))
    store=StringIO.StringIO()
    config.write(store)
    db.execute("INSERT INTO config (object) values (?)",(store.getvalue(),))
    conn.commit()
    return
def delete_nonexisting(sqlfile,options):
    """Remove DB rows whose file is gone from disk.

    When symlinks are not being followed (the default), rows that point at
    symlinks are removed as well, even if the link target exists.
    """
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    reader=conn.cursor()
    writer=conn.cursor()
    reader.execute('SELECT file FROM list')
    for (stored,) in reader:
        if not os.path.exists(stored):
            doomed=True
        elif (not options.symlinks) and os.path.islink(stored):
            doomed=True
        else:
            doomed=False
        if doomed:
            print('removing.. '+stored)
            writer.execute("DELETE FROM list where file == ?",(stored,))
    conn.commit()
    return
def disk_used(options):
    """Print stored sizes summarized per directory, du(1)-style.

    options.diskused is the root path; options.diskused_depth controls how
    many leading path components form one summary group.  Each line shows
    raw bytes, a humanized size, and the group path.
    """
    conn=sqlite3.connect(options.sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    checkpath=filename_join(options.diskused,"",options)+"/"
    if checkpath=="./":
        # Relative DB rooted here: stored paths carry no './' prefix.
        checkpath=""
    # Strip the root prefix in SQL so splitting below yields relative parts.
    db.execute('SELECT size,replace(file,?,"") as path FROM list WHERE file LIKE ?',
               (checkpath,
                checkpath+"%",
               ))
    entries=[]
    sizes=[]
    for row in db:
        # Truncate the path to the requested depth and accumulate per group.
        start_path=row[1].split('/')
        start_path="/".join(start_path[0:int(options.diskused_depth)])
        if start_path not in entries:
            entries.append(start_path)
            sizes.append(row[0])
        else:
            sizes[ entries.index(start_path) ]+=row[0]
    for entry in zip(sizes,entries):
        print("| ".join([ str(entry[0]).ljust(14),
                          humanize_size(entry[0]).rjust(8),
                          entry[1]]))
def filename_join(path,name,options):
    """Join *path*/*name*, resolve symlinks, and honour --relative storage.

    With options.relative set, the result is expressed relative to the
    directory holding the database (options.sqlpath); otherwise it is the
    absolute resolved path.
    """
    resolved = os.path.realpath(os.path.join(path, name))
    if not options.relative:
        return resolved
    return os.path.relpath(resolved, options.sqlpath)
def find_duplicates(sqlfile):
    """Return duplicate groups as [(hash, [(file, size, date), ...]), ...].

    Only non-empty files sharing a hash with at least one other row are
    included.  Each member list is sorted by filename and the groups are
    ordered by their first member.
    """
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    dbh=conn.cursor()
    db.execute("SELECT hash,count(*) FROM list WHERE size > 0 GROUP BY hash HAVING count(*) > 1 ")
    duphash=[]
    for hashrow in db:
        digest = hashrow[0]
        dbh.execute("SELECT file,size,date FROM list WHERE hash = ?",(digest,))
        members = sorted(dbh.fetchall(), key=lambda rec: rec[0])
        duphash.append((digest, members))
    duphash.sort(key=lambda group: group[1][0])
    return duphash
def ftime_match(db,filename,ftime):
    """True when the mtime stored for *filename* equals *ftime*."""
    db.execute("SELECT date FROM list where file == ?",(filename,))
    rows = db.fetchall()
    stored = rows[0][0]
    return stored == ftime
def get_folder_contents(db,path):
    ''' Return the basenames of DB rows directly inside folder *path*.

    path must end in "/" (or be "./" for the DB's own directory in
    relative mode).  Rows from deeper subdirectories are filtered out
    because their remainder still contains a "/".
    '''
    files=[]
    if path=="./":
        # Relative DB root: top-level rows are those with no '/' at all.
        db.execute("SELECT file FROM list where file NOT LIKE ?",('%/%',))
        path=""
    else:
        db.execute("SELECT file FROM list where file LIKE ?",(path+'%',))
    for row in db:
        try:
            # Python 2 byte-string rows: decode, then strip the folder prefix.
            base=row[0].decode('utf-8').replace(path,'',1)
        except UnicodeDecodeError:
            print(row[0]+" is giving me trouble.")
            try:
                # Fallback for rows stored in a different encoding.
                # NOTE(review): encode() on a Py2 str implicitly decodes via
                # ASCII first, so this may still fail -- hence the inner except.
                base=row[0].encode('utf-8').replace(path,'',1)
            except UnicodeDecodeError:
                print(row[0]+" is still giving me trouble.")
                sys.exit(1)
        if base.find('/')==-1:
            # No remaining separator: the row lives directly in this folder.
            files.append(base)
    return files
def get_md5(filename,fullfile=False):
    ''' Return the md5 hex digest of *filename*.

    By default only the first DEFAULT_CHUNK (50MB) bytes are hashed.  With
    fullfile=True the entire file is read; files larger than DEFAULT_CHUNK
    are streamed block by block with a small progress spinner on stdout.
    '''
    fsize=os.path.getsize(filename)
    if fullfile and fsize>DEFAULT_CHUNK:
        anim_i=0
        anim_len=len(ANIM)
        block_size=2**24
        # How much of the file one block represents, for the % display.
        percents_per_block=100/(float(fsize)/block_size)
        md5 = hashlib.md5()
        with open(filename,'rb') as f:
            for chunk in iter(lambda: f.read(block_size), b''):
                sys.stdout.write('\r %s (%02d%%)'%(ANIM[anim_i%anim_len],int(anim_i*percents_per_block)))
                sys.stdout.flush()
                anim_i+=1
                md5.update(chunk)
        # Wipe the spinner from the progress line.
        sys.stdout.write('\r          ')
        return md5.hexdigest()
    # Fix: close the handle instead of leaking it (the original called
    # open() inline and never closed the file).
    with open(filename,'rb') as f:
        return hashlib.md5(f.read(DEFAULT_CHUNK)).hexdigest()
def has_changes(options):
    """Dispatch the --haschanges / --hasdeletions / --hasadditions probes.

    Each probe prints 'True' and exits with status 1 at the first
    discrepancy; falling through means the DB matches the filesystem.
    --haschanges implies checking mtimes too (options.changed).
    """
    conn=sqlite3.connect(options.sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    if options.haschanges:
        options.changed=True
    if options.hasdeletions or options.haschanges:
        has_changes_deleted(db)
    if options.hasadditions or options.haschanges:
        has_changes_additions(db,options)
def has_changes_deleted(db,exit=True):
    """Find DB rows whose file no longer exists on disk.

    With exit=True (the CLI probe mode) print 'True' and terminate with
    status 1 at the first missing file; with exit=False collect and
    return every missing path instead.
    """
    db.execute('SELECT file FROM list')
    deleted=[]
    for (stored,) in db:
        if os.path.exists(stored):
            continue
        if exit:
            print('True')
            sys.exit(1)
        deleted.append(stored)
    return deleted
def has_changes_additions(db,options,exit=True):
    """Walk the filesystem looking for files absent from, or newer than, the DB.

    With exit=True print 'True' and terminate with status 1 at the first
    discrepancy; with exit=False return the tuple (added, changed) of
    full paths.  Mtime comparison only happens when options.changed is set.
    """
    added=[]
    changed=[]
    for path,dirs,files in os.walk(options.startpath,followlinks=options.symlinks):
        # In-place prune so os.walk skips excluded/hidden directories.
        dirs=clean_dirs(dirs)
        db_files=get_folder_contents(db,filename_join(path,"",options)+"/")
        if not options.symlinks:
            files=clean_syms(files,path)
        for file in files:
            filename=filename_join(path,file,options)
            if file==options.sqlfile:
                # The database file itself is never indexed.
                continue
            #if not is_listed(db,filename):
            if file not in db_files:
                if exit:
                    print('True')
                    sys.exit(1)
                else:
                    added.append(filename)
            else:
                if options.changed:
                    ftime=os.path.getmtime(filename)
                    if not ftime_match(db,filename,ftime):
                        #file content changed
                        if exit:
                            print('True')
                            sys.exit(1)
                        else:
                            changed.append(filename)
    return (added,changed)
#~ def hash_match(db,filename,hash):
#~ db.execute("SELECT hash FROM list where file == ?",(filename,))
#~ count=db.fetchall()
#~ return count[0][0]==hash
def humanize_date(date):
    """Format a unix timestamp as 'YYYY-mm-dd HH:MM:SS' local time; None -> ''."""
    if date is None:
        return ''
    stamp = datetime.datetime.fromtimestamp(int(date))
    return stamp.strftime('%Y-%m-%d %H:%M:%S')
def humanize_size(size,precision=1):
    """Render a byte count as a short human-readable string (e.g. '1.5MB').

    size      -- number of bytes, or None (returns 'nan')
    precision -- decimal places once the value has been scaled to KB or above;
                 plain byte counts are printed without decimals ('500B')
    """
    if size is None:
        return 'nan'
    suffixes=['B','KB','MB','GB','TB']
    suffixIndex = 0
    defPrecision=0
    # Fix: use >= so exactly 1024 bytes reads '1.0KB' (the original '>'
    # printed '1024B'); stop at the last suffix so absurd sizes cannot
    # index past the end of the list.
    while size >= 1024 and suffixIndex < len(suffixes)-1:
        suffixIndex += 1 #increment the index of the suffix
        size = size/1024.0 #apply the division
        defPrecision=precision
    return "%.*f%s"%(defPrecision,size,suffixes[suffixIndex])
def is_listed(db,filename):
    """Return True when *filename* already has at least one row in list."""
    db.execute("SELECT COUNT(*) FROM list where file == ?",(filename,))
    (count,) = db.fetchone()
    return count > 0
def matchdb(sqlfile,needle,helper):
    """Print the stored filename whose basename best fuzzy-matches *needle*.

    helper -- optional list of substrings; when given, only rows whose path
              matches one of them (SQL LIKE) are considered.
    Uses difflib ratio on lowercased basenames; prints the single winner.
    """
    import difflib as dl
    needle=needle.lower()
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    if len(helper)>0:
        helper=['%'+i+'%' for i in helper]
        like_query=' OR '.join(['file LIKE ?' for i in helper])
        db.execute("SELECT file FROM list WHERE "+like_query+" ORDER BY date DESC",helper)
    else:
        db.execute("SELECT file FROM list ORDER BY date DESC")
    best_match=""
    ratio=0
    for row in db:
        candidate=row[0]
        score=dl.SequenceMatcher(None, os.path.basename(candidate).lower(), needle).ratio()
        if score > ratio:
            ratio=score
            best_match=candidate
    print(best_match)
def print_structure(files):
    """Print duplicate groups as numbered 'index: size:filename' lines.

    files -- list of (hash, [(filename, size, date), ...]) groups as
             returned by find_duplicates().  The hash itself is not printed.
    """
    for dup in files:
        #print(dup[0])
        for i, entry in enumerate(dup[1], start=1):
            # Fix: the original used the Python-2-only print statement; the
            # call form emits identical output and is valid on Python 3 too.
            print("%(i)d: %(x)d:%(f)s " % {'i': i, 'f': entry[0], 'x': entry[1]})
    return
def print_stderr(s):
    """Write *s* plus a newline to stderr and flush immediately."""
    sys.stderr.write("%s\n" % (s,))
    sys.stderr.flush()
def searchdb(sqlfile,needle):
    """Print every stored filename matching any *needle* substring (SQL LIKE)."""
    patterns=['%'+term+'%' for term in needle]
    like_query=' OR '.join(['file LIKE ?' for term in patterns])
    conn=sqlite3.connect(sqlfile)
    conn.text_factory=str
    db=conn.cursor()
    db.execute("SELECT file FROM list WHERE "+like_query+" ORDER BY file",patterns)
    for row in db:
        print(row[0])
def stored_options(options):
    """Overlay *options* with Relative/FullFile settings stored in the DB.

    The config table holds an INI blob written by createdb().  Reading is
    deliberately best-effort: an old database without a config table, or a
    malformed blob, leaves the command-line options untouched.
    """
    try:
        conn=sqlite3.connect(options.sqlfile)
        db=conn.cursor()
        conn.text_factory=str
        db.execute("SELECT object FROM config")
        store=""
        for row in db:
            store+=row[0]+'\n'
        config = ConfigParser.RawConfigParser()
        config.readfp(io.BytesIO(store))
        options.relative=config.getboolean("General","Relative")
        options.fullfile=config.getboolean("General","FullFile")
    except Exception:
        # Fix: narrowed from a bare except so SystemExit/KeyboardInterrupt
        # are no longer swallowed; ordinary failures still fall through to
        # the command-line defaults.
        pass
    return options
def main():
    """Command-line entry point: dispatch on the parsed options."""
    options=setup_options()
    # Create the database on first use, then let settings stored at
    # creation time (relative paths, full-file hashing) override defaults.
    if not os.path.exists(options.sqlfile):
        createdb(options)
    options=stored_options(options)
    if options.relative:
        # Relative databases store paths relative to the DB's directory.
        os.chdir(options.sqlpath)
    # Query-style modes print their result and exit immediately.
    if options.haschanges or options.hasadditions or options.hasdeletions:
        has_changes(options)
        sys.exit(0)
    if options.check:
        checkdb(options)
        sys.exit(0)
    if len(options.search)>0 and not options.match:
        searchdb(options.sqlfile,options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile,options.match,options.search)
        sys.exit(0)
    if options.diskused:
        disk_used(options)
        sys.exit(0)
    # Mutating modes may combine: delete stale rows, add/update files,
    # then optionally report duplicates.
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile,options)
    if options.add or options.changed:
        print('Adding '+options.startpath+' entries...')
        add_recurse(options)
    if options.duplicate:
        files=find_duplicates(options.sqlfile)
        print_structure(files)
    sys.exit(0)


if __name__ == '__main__':
    # Fix: guard the entry point so importing this module for reuse or
    # testing no longer runs the CLI as a side effect.
    main()