# Files
# q-tools/files/file_list.py
# 2021-09-24 19:21:30 +03:00
#
# 563 lines
# 20 KiB
# Python
# Executable File
#
#!/usr/bin/env python3
# -*- coding: latin-1 -*-
from builtins import zip
from builtins import str
import sys
import os
import re
import sqlite3
import subprocess
import hashlib
import magic
from argparse import ArgumentParser
import configparser
import io
import datetime
# Default database filename, created next to where the tool is run.
SQLFILE='list_of_files.sqlite'
# Case-insensitive image-name matcher.
# fix: raw string — '\.' in a plain literal is an invalid escape sequence
# (DeprecationWarning today, SyntaxError in future Pythons).
IMGMATCH=re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$',re.I)
# Directory names excluded from every walk; extended by the -x option.
BADDIRS=[]
MINSIZE=0
# libmagic handle used for file-type detection (MAGIC_NONE = textual description).
MIME=magic.open(magic.MAGIC_NONE)
#MIME=magic.open(magic.MAGIC_MIME)
MIME.load()
# Spinner frames shown while hashing large files.
ANIM=['.','·',"'","'",'·','.','_']
# 50 MB: amount of each file hashed unless --full is given.
DEFAULT_CHUNK=1024*1024*50
def setup_options():
    """Parse the command line and derive the remaining settings.

    Returns the argparse Namespace augmented with ``sqlpath`` (the real
    directory of the database file, needed for --relative databases).
    Side effect: extends the module-level BADDIRS exclusion list with -x
    values, and --dup flips the "add new files" flag.
    """
    parser = ArgumentParser(description="Maintains the list of images sqlite file")
    parser.add_argument("-a", action="store_false", dest="add", default=True,
                        help="Do not add new files [%(default)s]")
    parser.add_argument("-c", action="store_true", dest="changed", default=False,
                        help="Modify changed files [%(default)s]")
    parser.add_argument("--check", action="store_true", dest="check", default=False,
                        help="Check md5sums of files. Limit check with -s.")
    parser.add_argument("-d", action="store_true", dest="delete", default=False,
                        help="Delete non-existing entries [%(default)s]")
    parser.add_argument("--du", type=str, action='store', dest="diskused", default=False,
                        help="Print directory sizes. Argument is the path where directories are listed from.")
    parser.add_argument("--du-depth", type=str, action='store', dest="diskused_depth", default=1,
                        help="Depth of summarization for --du.")
    parser.add_argument("--dup", action="store_true", dest="duplicate", default=False,
                        help="Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]")
    parser.add_argument("--dup-order", action="store", dest="duplicate_order", default='path',
                        help="Order duplicates by a method. (length = path str length)",
                        choices=('age', 'length', 'file', 'path'))
    parser.add_argument("--haschanges", action="store_true", dest="haschanges", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasdeletions", action="store_true", dest="hasdeletions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("--hasadditions", action="store_true", dest="hasadditions", default=False,
                        help="Do not change anything, return True and exit code 1 if DB needs update. Exit code 0 if all intact.")
    parser.add_argument("-f", action="store", dest="sqlfile", default=SQLFILE,
                        help="SQL file name to use [%(default)s]")
    parser.add_argument("-l", action="store_true", dest="symlinks", default=False,
                        help="Follow symbolic links [%(default)s]")
    parser.add_argument("--match", type=str, dest="match", default=False,
                        help="Search for closest match from basenames, can be helped with adding -s")
    parser.add_argument("-s", type=str, action='append', dest="search", default=[],
                        help="Search list based on path pattern")
    parser.add_argument("-x", action="append", dest="exclude", default=[],
                        help="Exclude folder name from the lists. This option may be issued several times")
    parser.add_argument("--full", action="store_true", dest="fullfile", default=False,
                        help="ONLY FOR NEW DB CREATION. Use full files to calculate md5 checksum. Defaults to first 50Mb. [%(default)s]")
    parser.add_argument("--relative", action="store_true", dest="relative", default=False,
                        help="ONLY FOR NEW DB CREATION. Store filenames relative to database file.")
    parser.add_argument('startpath', action="store", default='.', nargs='?')
    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        # --dup normally means "don't add"; combined with -a it adds again.
        options.add = not options.add
    options.sqlpath = os.path.dirname(os.path.realpath(options.sqlfile))
    return options
def add_recurse(options):
    """Walk options.startpath and add (and optionally refresh) file rows.

    New files are inserted when options.add is set; with options.changed,
    files whose mtime differs from the stored one are re-hashed and
    updated.  Progress (current directory) is echoed to stdout.
    """
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    prev_path_len = 0
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        # Pad with spaces so a shorter path overwrites the previous line.
        sys.stdout.write("\r%s%s" % (filename_join(path, ".", options),
                                     (prev_path_len - len(path)) * ' '))
        prev_path_len = len(path)
        dirs = clean_dirs(dirs)
        dirs.sort()
        files.sort()
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for entry in files:
            filename = filename_join(path, entry, options)
            if entry == options.sqlfile:
                continue
            if not os.path.isfile(filename):
                continue
            if entry not in db_files:
                if options.add:
                    add_single(conn, filename, change=False, fullfile=options.fullfile)
            elif options.changed:
                ftime = os.path.getmtime(filename)
                if not ftime_match(db, filename, ftime):
                    # mtime differs from the stored row: re-hash and update
                    add_single(conn, filename, change=True, fullfile=options.fullfile)
        conn.commit()
    sys.stdout.write("\n")
    return
def add_single(conn, filename, change=False, hash=None, minsize=0, fullfile=False):
    """Insert or update one file's row (path, mtime, md5, size, mime type).

    change=True issues an UPDATE on the existing row instead of an INSERT.
    A precomputed *hash* skips re-reading the file.  *minsize* is accepted
    for interface compatibility but currently unused.
    """
    try:
        fsize = os.path.getsize(filename)
        hsize = humanize_size(fsize)
    except IOError:
        # fix: fsize was previously left unbound here, causing a NameError
        # at the INSERT/UPDATE below whenever getsize failed
        fsize = None
        hsize = ""
    print("\r%s (%s)" % (filename, hsize))
    db = conn.cursor()
    try:
        if hash is None:
            hash = get_md5(filename, fullfile)
        ftime = os.path.getmtime(filename)
        # fix: str(filename.encode('UTF-8')) produces "b'...'" on Python 3,
        # so libmagic was queried with a mangled, non-existent path
        mime = MIME.file(filename)
    except IOError:
        print("File '%s' not found. Bad link?" % (filename,))
        return
    except (UnicodeDecodeError, TypeError):
        mime = "NA"
    if change:
        db.execute("UPDATE list SET date=?, hash=?, size=?, mime=? \
WHERE file=?", (ftime, hash, fsize, mime, filename))
    else:
        db.execute("INSERT INTO list(file,date,hash,size,mime)\
VALUES(?,?,?,?,?)", (filename, ftime, hash, fsize, mime))
    sys.stdout.write('\r')
    return
def checkdb(options):
    """Verify stored checksums against the files on disk and print a report.

    The check can be limited with -s search patterns.  Reports four
    categories: OK, checksum mismatch, missing on disk, and files present
    on disk but absent from the database.
    """
    needle = options.search
    if len(needle) == 0:
        needle.append('%')  # no pattern given: check everything
    needle = ['%' + i + '%' for i in needle]
    like_query = ' OR '.join(['file LIKE ?' for i in needle])
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file,hash,size,date FROM list WHERE " + like_query + " ORDER BY file", needle)
    missing = []
    differing = []
    OK_count = 0
    for row in db:
        status = 'OK'
        sys.stdout.write("\r%s" % (row[0],))
        if os.path.exists(row[0]):
            md5f = get_md5(row[0], options.fullfile)
            if row[1] != md5f:
                status = 'Checksum-difference'
                differing.append(row)
        else:
            status = 'Not-found'
            missing.append(row)
        sys.stdout.write("\r%s %s\n" % (row[0], status))
        if status == 'OK':
            OK_count += 1
    if len(differing) > 0:
        print_stderr("----\nDiffering files:")
        pad = str(max([len(x[0]) for x in differing]))
        for f in differing:
            print(("%-" + pad + "s (%s %7s => %s %7s)") % (f[0], humanize_date(f[3]), humanize_size(f[2]),
                                                           humanize_date(os.path.getmtime(f[0])),
                                                           humanize_size(os.path.getsize(f[0]))))
    if len(missing) > 0:
        print("----\nMissing files:")
        pad = str(max([len(x[0]) for x in missing]))
        for f in missing:
            print(("%-" + pad + "s (%s %7s)") % (f[0], humanize_date(f[3]), humanize_size(f[2])))
    (added, changed) = has_changes_additions(db, options, False)
    if len(added) > 0:
        print("----\nAdded files:")
        # fix: 'added' holds plain filename strings, so len(x[0]) measured the
        # first *character* (always 1); measure the whole name instead
        pad = str(max([len(x) for x in added]))
        for f in added:
            print(("%-" + pad + "s (%s %7s)") % (f,
                                                 humanize_date(os.path.getmtime(f)),
                                                 humanize_size(os.path.getsize(f))))
    print("----\nFile check summary:")
    print("Database modified: %s" % (humanize_date(os.path.getmtime(options.sqlfile)),))
    print("Checksum matches : %d" % (OK_count,))
    print("Checksum mismatch: %d" % (len(differing),))
    print("Files missing    : %d" % (len(missing),))
    print("Files added      : %d" % (len(added),))
def clean_dirs(dirs):
    """Prune excluded (-x) and dot-prefixed directories from *dirs*.

    Mutates the list in place (so os.walk skips them) and also returns it.
    """
    for name in dirs[:]:  # iterate a copy: we remove while scanning
        if name in BADDIRS or name.startswith("."):
            dirs.remove(name)
    return dirs
def clean_syms(files, path):
    """Return the entries of *files* that are not symlinks under *path*."""
    return [name for name in files
            if not os.path.islink(os.path.join(path, name))]
def createdb(options):
    """Create a fresh database: the file list table plus a config table.

    The config table stores an INI snapshot of the creation-time options
    (Relative, FullFile) so subsequent runs can reuse them.
    """
    conn = sqlite3.connect(options.sqlfile)
    db = conn.cursor()
    conn.text_factory = str
    db.execute('CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,'
               'file TEXT,date INTEGER, hash TEXT,'
               'size INTEGER, mime TEXT)')
    db.execute('CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT,'
               'object TEXT)')
    conn.commit()
    # Serialize the creation options as an INI document into the config table.
    config = configparser.RawConfigParser()
    config.add_section("General")
    config.set("General", "Relative", str(options.relative))
    config.set("General", "FullFile", str(options.fullfile))
    store = io.StringIO()
    config.write(store)
    db.execute("INSERT INTO config (object) values (?)", (store.getvalue(),))
    conn.commit()
    return
def delete_nonexisting(sqlfile, options):
    """Drop rows whose file is gone (or is a symlink when -l is not set)."""
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbdel = conn.cursor()  # second cursor: delete while the first iterates
    db.execute('SELECT file FROM list')
    for row in db:
        if os.path.exists(row[0]):
            # Existing file: only drop it if it is a symlink and we are
            # not following symlinks.
            drop = not options.symlinks and os.path.islink(row[0])
        else:
            drop = True
        if drop:
            print('removing.. ' + row[0])
            dbdel.execute("DELETE FROM list where file == ?", (row[0],))
    conn.commit()
    return
def disk_used(options):
    """Print per-directory size totals from the database, like du(1).

    Paths are summarized up to --du-depth components below the --du path.
    """
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    checkpath = filename_join(options.diskused, "", options) + "/"
    if checkpath == "./":
        checkpath = ""  # database stores paths without a leading './'
    db.execute('SELECT size,replace(file,?,"") as path FROM list WHERE file LIKE ?',
               (checkpath, checkpath + "%"))
    entries = []
    sizes = []
    for row in db:
        components = row[1].split('/')
        prefix = "/".join(components[0:int(options.diskused_depth)])
        if prefix in entries:
            sizes[entries.index(prefix)] += row[0]
        else:
            entries.append(prefix)
            sizes.append(row[0])
    for total, name in zip(sizes, entries):
        print("| ".join([str(total).ljust(14),
                         humanize_size(total).rjust(8),
                         name]))
def filename_join(path, name, options):
    """Join *path*/*name* into a canonical absolute path.

    When the database stores relative names (--relative), the result is
    made relative to the database directory (options.sqlpath) instead.
    """
    absolute = os.path.realpath(os.path.join(path, name))
    if options.relative:
        return os.path.relpath(absolute, options.sqlpath)
    return absolute
def find_duplicates(sqlfile, order):
    """Group files that share an MD5 hash.

    Returns a list of (hash, rows) pairs where rows are (file, size, date)
    tuples.  Zero-byte files are ignored.  Each group is ordered by the
    --dup-order method; groups are sorted by their first row.
    """
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbh = conn.cursor()
    db.execute("SELECT hash,count(*) FROM list WHERE size > 0 GROUP BY hash HAVING count(*) > 1 ")
    duphash = []
    for row in db:
        digest = row[0]
        dbh.execute("SELECT file,size,date FROM list WHERE hash = ?", (digest,))
        group = list(dbh)
        sort_by_method(group, order)
        duphash.append((digest, group))
    duphash.sort(key=lambda item: item[1][0])
    return duphash
def ftime_match(db, filename, ftime):
    """True when the stored mtime for *filename* equals *ftime*.

    Assumes *filename* has a row in the table (IndexError otherwise).
    """
    db.execute("SELECT date FROM list where file == ?", (filename,))
    rows = db.fetchall()
    return rows[0][0] == ftime
def get_folder_contents(db, path):
    '''Return the basenames stored directly under *path* ("./" = DB root).

    Entries deeper in the tree (containing a further "/") are skipped.
    '''
    files = []
    if path == "./":
        # Root of a relative database: files with no directory component.
        db.execute("SELECT file FROM list where file NOT LIKE ?", ('%/%',))
        path = ""
    else:
        db.execute("SELECT file FROM list where file LIKE ?", (path + '%',))
    for row in db:
        # fix: removed the Python-2 era fallback that called
        # row[0].encode('utf-8').replace(path, ...) — bytes.replace with str
        # arguments is a guaranteed TypeError on Python 3, and str.replace
        # cannot raise UnicodeDecodeError in the first place.
        base = row[0].replace(path, '', 1)
        if base.find('/') == -1:
            files.append(base)
    return files
def get_md5(filename, fullfile=False):
    '''Return the MD5 hex digest of the file's content.

    Only the first 50 MB (DEFAULT_CHUNK) is hashed unless *fullfile* is
    true and the file is larger than one chunk; in that case the whole
    file is read block by block with a progress spinner on stderr.
    '''
    fsize = os.path.getsize(filename)
    if fullfile and fsize > DEFAULT_CHUNK:
        anim_i = 0
        anim_len = len(ANIM)
        block_size = 2**24
        percents_per_block = int(100 / (float(fsize) / block_size))
        md5 = hashlib.md5()
        with open(filename, 'rb') as f:
            for chunk in iter(lambda: f.read(block_size), b''):
                sys.stderr.write('\r %s (%02d%%)' % (ANIM[anim_i % anim_len], int(anim_i * percents_per_block)))
                sys.stderr.flush()
                anim_i += 1
                md5.update(chunk)
        sys.stderr.write('\r ')
        return md5.hexdigest()
    # fix: close the file deterministically instead of leaking the handle
    # (the old one-liner relied on the GC to close it)
    with open(filename, 'rb') as f:
        return hashlib.md5(f.read(DEFAULT_CHUNK)).hexdigest()
def has_changes(options):
    """Dispatch the --haschanges/--hasdeletions/--hasadditions dry-run checks.

    --haschanges implies the mtime comparison and runs both sub-checks.
    Each sub-check prints 'True' and exits(1) on the first difference.
    """
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if options.haschanges:
        options.changed = True
    if options.hasdeletions or options.haschanges:
        has_changes_deleted(db)
    if options.hasadditions or options.haschanges:
        has_changes_additions(db, options)
def has_changes_deleted(db, exit=True):
    """Find database entries whose file no longer exists on disk.

    With exit=True, prints 'True' and terminates with status 1 on the
    first missing file; with exit=False, returns all missing paths.
    """
    db.execute('SELECT file FROM list')
    deleted = []
    for row in db:
        if os.path.exists(row[0]):
            continue
        if exit:
            print('True')
            sys.exit(1)
        else:
            deleted.append(row[0])
    return deleted
def has_changes_additions(db, options, exit=True):
    """Walk the tree and detect files added (and, with -c, changed) since the DB.

    With exit=True, prints 'True' and terminates with status 1 at the
    first difference; otherwise returns the accumulated lists as a
    (added, changed) tuple of filenames.
    """
    added = []
    changed = []
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        dirs = clean_dirs(dirs)
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for entry in files:
            filename = filename_join(path, entry, options)
            if entry == options.sqlfile:
                continue
            if entry not in db_files:
                if exit:
                    print('True')
                    sys.exit(1)
                else:
                    added.append(filename)
            elif options.changed:
                ftime = os.path.getmtime(filename)
                if not ftime_match(db, filename, ftime):
                    # stored mtime differs: the content presumably changed
                    if exit:
                        print('True')
                        sys.exit(1)
                    else:
                        changed.append(filename)
    return (added, changed)
#~ def hash_match(db,filename,hash):
#~ db.execute("SELECT hash FROM list where file == ?",(filename,))
#~ count=db.fetchall()
#~ return count[0][0]==hash
def humanize_date(date):
    """Format a unix timestamp as 'YYYY-mm-dd HH:MM:SS' (local time).

    None yields the empty string.
    """
    if date is None:
        return ''
    stamp = datetime.datetime.fromtimestamp(int(date))
    return stamp.strftime('%Y-%m-%d %H:%M:%S')
def humanize_size(size, precision=1):
    """Render *size* bytes as a short human string, e.g. '500B' or '2.0KB'.

    None yields 'nan'.  Whole-byte values get no decimals; scaled values
    use *precision* decimals.  NOTE(review): the strict '>' means exactly
    1024 of a unit is shown un-scaled ('1024B'), and sizes beyond TB would
    overrun the suffix list — preserved as-is.
    """
    if size is None:
        return 'nan'
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    unit_index = 0
    shown_precision = 0
    while size > 1024:
        unit_index += 1
        size = float(size / 1024.0)
        shown_precision = precision
    return "%.*f%s" % (shown_precision, size, units[unit_index])
def is_listed(db, filename):
    """Return True when *filename* already has a row in the list table."""
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    rows = db.fetchall()
    return rows[0][0] > 0
def matchdb(sqlfile, needle, helper):
    """Print the stored filename whose basename best matches *needle*.

    Matching is fuzzy (difflib ratio on lowercased basenames).  *helper*
    patterns (from -s) narrow the candidate rows via SQL LIKE.
    """
    import difflib as dl
    needle = needle.lower()
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if len(helper) > 0:
        helper = ['%' + i + '%' for i in helper]
        like_query = ' OR '.join(['file LIKE ?' for i in helper])
        db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY date DESC", helper)
    else:
        db.execute("SELECT file FROM list ORDER BY date DESC")
    best_ratio = 0
    best_match = ""
    for row in db:
        candidate = dl.SequenceMatcher(None, os.path.basename(row[0]).lower(), needle).ratio()
        if best_ratio < candidate:
            best_ratio = candidate
            best_match = row[0]
    print(best_match)
def print_duplicates(files):
    """Print duplicate groups as pipe-separated 'index|size|date|file' lines.

    *files* is the (hash, rows) list produced by find_duplicates; the
    index restarts at 1 for each group.
    """
    for group in files:
        for index, entry in enumerate(group[1], start=1):
            print("%(i)d|%(s)s|%(d)s|%(f)s " % {
                'i': index,
                'f': entry[0],
                'd': humanize_date(entry[2]),
                's': humanize_size(entry[1]),
            })
    return
def print_stderr(s):
    """Write *s* followed by a newline to stderr and flush immediately."""
    sys.stderr.write(s)
    sys.stderr.write("\n")
    sys.stderr.flush()
def searchdb(sqlfile, needle):
    """Print every stored path that LIKE-matches any of the -s patterns."""
    patterns = ['%' + term + '%' for term in needle]
    like_query = ' OR '.join(['file LIKE ?' for _ in patterns])
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY file", patterns)
    for row in db:
        print(row[0])
def sort_by_method(flist, order):
    """Sort (file, size, date) rows in place by the --dup-order method.

    Unknown *order* values leave the list untouched.
    """
    key_funcs = {
        'path':   lambda row: row[0],
        'file':   lambda row: os.path.basename(row[0]),
        'age':    lambda row: row[2],
        'length': lambda row: len(row[0]),
    }
    key = key_funcs.get(order)
    if key is not None:
        flist.sort(key=key)
def stored_options(options):
    """Overlay the creation-time options (Relative, FullFile) stored in the DB.

    Best-effort by design: on any failure (missing config table, empty or
    unparsable content) the command-line options are returned unchanged.
    """
    try:
        conn = sqlite3.connect(options.sqlfile)
        db = conn.cursor()
        conn.text_factory = str
        db.execute("SELECT object FROM config")
        store = ""
        for row in db:
            store += row[0] + '\n'
        config = configparser.RawConfigParser()
        # fix: readfp(io.BytesIO(store)) passed a str to BytesIO — a
        # TypeError on Python 3 that the bare except silently swallowed, so
        # the stored options were never applied; readfp is also removed in
        # modern Python. read_string parses the text directly.
        config.read_string(store)
        options.relative = config.getboolean("General", "Relative")
        options.fullfile = config.getboolean("General", "FullFile")
    except Exception:
        # deliberate best-effort: keep the command-line options as-is
        pass
    return options
def main():
    """Entry point: create/load the database, then run the requested action.

    Query-style actions (--has*, --check, -s, --match, --du, --dup) exit
    immediately; -d and add/update may both run in one invocation.
    """
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options)
    options = stored_options(options)
    if options.relative:
        # Relative databases store paths relative to the DB directory,
        # so operate from there.
        os.chdir(options.sqlpath)
    if options.haschanges or options.hasadditions or options.hasdeletions:
        has_changes(options)
        sys.exit(0)
    if options.check:
        checkdb(options)
        sys.exit(0)
    if len(options.search) > 0 and not options.match:
        searchdb(options.sqlfile, options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile, options.match, options.search)
        sys.exit(0)
    if options.diskused:
        disk_used(options)
        sys.exit(0)
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print('Adding ' + options.startpath + ' entries...')
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile, options.duplicate_order)
        print_duplicates(files)
        sys.exit(0)
main()