q-tools/files/file_list.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
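"""Maintain an sqlite catalogue of files.

Walks a directory tree and records path, mtime, size and an md5 hash
(first 50 MB by default) per file; the database then answers searches,
fuzzy matches, duplicate listings, disk-usage summaries, checksum
verification and change detection.
"""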
import sys
import os
import re
import sqlite3
import subprocess
import hashlib
# import magic
from argparse import ArgumentParser
import configparser
import io
import datetime
SQLFILE = "list_of_files.sqlite"
IMGMATCH = re.compile(r".*\.jpg$|.*\.jpeg$|.*\.png$", re.I)
BADDIRS = []
MINSIZE = 0
# MIME=magic.open(magic.MAGIC_NONE)
##MIME=magic.open(magic.MAGIC_MIME)
# MIME.load()
ANIM = [".", "·", "'", "'", "·", ".", "_"]
DEFAULT_CHUNK = 1024 * 1024 * 50

def setup_options():
    parser = ArgumentParser(description="Maintain the sqlite list-of-files database")
    parser.add_argument(
        "-a",
        action="store_false",
        dest="add",
        default=True,
        help="Do not add new files [%(default)s]",
    )
    parser.add_argument(
        "-c",
        action="store_true",
        dest="changed",
        default=False,
        help="Modify changed files [%(default)s]",
    )
    parser.add_argument(
        "--check",
        action="store_true",
        dest="check",
        default=False,
        help="Check md5sums of files. Limit the check with -s.",
    )
    parser.add_argument(
        "-d",
        action="store_true",
        dest="delete",
        default=False,
        help="Delete non-existing entries [%(default)s]",
    )
    parser.add_argument(
        "--du",
        type=str,
        action="store",
        dest="diskused",
        default=False,
        help="Print directory sizes. The argument is the path directories are listed from.",
    )
    parser.add_argument(
        "--du-depth",
        type=int,
        action="store",
        dest="diskused_depth",
        default=1,
        help="Depth of summarization for --du.",
    )
    parser.add_argument(
        "--dup",
        action="store_true",
        dest="duplicate",
        default=False,
        help="Return a list of duplicate files, based on hashes. This option flips the 'Add new files' option. [%(default)s]",
    )
    parser.add_argument(
        "--dup-order",
        action="store",
        dest="duplicate_order",
        default="path",
        help="Order duplicates by a method (length = path string length)",
        choices=("age", "length", "file", "path"),
    )
    parser.add_argument(
        "--haschanges",
        action="store_true",
        dest="haschanges",
        default=False,
        help="Do not change anything; print True and exit with code 1 if the DB needs an update, 0 if intact.",
    )
    parser.add_argument(
        "--hasdeletions",
        action="store_true",
        dest="hasdeletions",
        default=False,
        help="Like --haschanges, but only checks for entries whose files were deleted from disk.",
    )
    parser.add_argument(
        "--hasadditions",
        action="store_true",
        dest="hasadditions",
        default=False,
        help="Like --haschanges, but only checks for files on disk that are missing from the DB.",
    )
    parser.add_argument(
        "-f",
        action="store",
        dest="sqlfile",
        default=SQLFILE,
        help="SQLite file name to use [%(default)s]",
    )
    parser.add_argument(
        "-l",
        action="store_true",
        dest="symlinks",
        default=False,
        help="Follow symbolic links [%(default)s]",
    )
    parser.add_argument(
        "--match",
        type=str,
        dest="match",
        default=False,
        help="Find the closest basename match; narrow the candidates with -s",
    )
    parser.add_argument(
        "-s",
        type=str,
        action="append",
        dest="search",
        default=[],
        help="Search the list by path pattern",
    )
    parser.add_argument(
        "-x",
        action="append",
        dest="exclude",
        default=[],
        help="Exclude a folder name from the lists; may be given several times",
    )
    parser.add_argument(
        "--full",
        action="store_true",
        dest="fullfile",
        default=False,
        help="ONLY FOR NEW DB CREATION. Use whole files to calculate the md5 checksum instead of the first 50 MB. [%(default)s]",
    )
    parser.add_argument(
        "--relative",
        action="store_true",
        dest="relative",
        default=False,
        help="ONLY FOR NEW DB CREATION. Store filenames relative to the database file.",
    )
    parser.add_argument("startpath", action="store", default=".", nargs="?")
    options = parser.parse_args()
    BADDIRS.extend(options.exclude)
    if options.duplicate:
        options.add = not options.add
    options.sqlpath = os.path.dirname(os.path.realpath(options.sqlfile))
    return options
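# Illustrative invocations (not from the original source; they use only the
# options defined above):
#   ./file_list.py                                   index . into list_of_files.sqlite
#   ./file_list.py -c -f photos.sqlite ~/Pictures    also update changed files
#   ./file_list.py --dup --dup-order age             list duplicates, oldest first
#   ./file_list.py --check -s 2019                   verify checksums of paths matching 2019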

def add_recurse(options):
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    prev_path_len = 0
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        sys.stdout.write(
            "\r%s%s"
            % (filename_join(path, ".", options), (prev_path_len - len(path)) * " ")
        )
        prev_path_len = len(path)
        dirs = clean_dirs(dirs)
        dirs.sort()
        files.sort()
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for file in files:
            filename = filename_join(path, file, options)
            if file == options.sqlfile:
                continue
            if not os.path.isfile(filename):
                continue
            # if not is_listed(db,filename):
            if file not in db_files:
                if options.add:
                    add_single(conn, filename, change=False, fullfile=options.fullfile)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        add_single(
                            conn, filename, change=True, fullfile=options.fullfile
                        )
    conn.commit()
    sys.stdout.write("\n")
    return

def add_single(conn, filename, change=False, hash=None, minsize=0, fullfile=False):
    try:
        fsize = os.path.getsize(filename)
        hsize = humanize_size(fsize)
    except IOError:
        fsize = None
        hsize = ""
    print("\r%s (%s)" % (filename, hsize))
    db = conn.cursor()
    try:
        if hash is None:
            hash = get_md5(filename, fullfile)
        ftime = os.path.getmtime(filename)
    except IOError:
        print("File '%s' not found. Bad link?" % (filename,))
        return
    if change:
        db.execute(
            "UPDATE list SET date=?, hash=?, size=? WHERE file=?",
            (ftime, hash, fsize, filename),
        )
        # print "changing: %(f)s " % {'f':filename}
    else:
        db.execute(
            "INSERT INTO list(file,date,hash,size) VALUES(?,?,?,?)",
            (filename, ftime, hash, fsize),
        )
    sys.stdout.write("\r")
    return

def checkdb(options):
    needle = options.search
    if len(needle) == 0:
        needle.append("%")
    needle = ["%" + i + "%" for i in needle]
    like_query = " OR ".join(["file LIKE ?" for i in needle])
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute(
        "SELECT file,hash,size,date FROM list WHERE " + like_query + " ORDER BY file",
        needle,
    )
    missing = []
    differing = []
    OK_count = 0
    for row in db:
        status = "OK"
        sys.stdout.write("\r%s" % (row[0],))
        if os.path.exists(row[0]):
            md5f = get_md5(row[0], options.fullfile)
            if row[1] != md5f:
                status = "Checksum-difference"
                differing.append(row)
        else:
            status = "Not-found"
            missing.append(row)
        sys.stdout.write("\r%s %s\n" % (row[0], status))
        if status == "OK":
            OK_count += 1
    if len(differing) > 0:
        print_stderr("----\nDiffering files:")
        pad = str(max([len(x[0]) for x in differing]))
        for f in differing:
            print(
                ("%-" + pad + "s (%s %7s => %s %7s)")
                % (
                    f[0],
                    humanize_date(f[3]),
                    humanize_size(f[2]),
                    humanize_date(os.path.getmtime(f[0])),
                    humanize_size(os.path.getsize(f[0])),
                )
            )
    if len(missing) > 0:
        print("----\nMissing files:")
        pad = str(max([len(x[0]) for x in missing]))
        for f in missing:
            print(
                ("%-" + pad + "s (%s %7s)")
                % (f[0], humanize_date(f[3]), humanize_size(f[2]))
            )
    (added, changed) = has_changes_additions(db, options, False)
    if len(added) > 0:
        print("----\nAdded files:")
        pad = str(max([len(x[0]) for x in added]))
        for f in added:
            print(
                ("%-" + pad + "s (%s %7s)")
                % (
                    f,
                    humanize_date(os.path.getmtime(f)),
                    humanize_size(os.path.getsize(f)),
                )
            )
    print("----\nFile check summary:")
    print("Database modified: %s" % (humanize_date(os.path.getmtime(options.sqlfile)),))
    print("Checksum matches : %d" % (OK_count,))
    print("Checksum mismatch: %d" % (len(differing),))
    print("Files missing    : %d" % (len(missing),))
    print("Files added      : %d" % (len(added),))

def clean_dirs(dirs):
    for s in dirs[:]:
        if (s in BADDIRS) or (s.startswith(".")):
            dirs.remove(s)
    return dirs

def clean_syms(files, path):
    nonsyms = []
    for f in files:
        if not os.path.islink(os.path.join(path, f)):
            nonsyms.append(f)
    return nonsyms

def createdb(options):
    conn = sqlite3.connect(options.sqlfile)
    db = conn.cursor()
    conn.text_factory = str
    db.execute(
        "CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT,"
        " file TEXT, date INTEGER, hash TEXT,"
        " size INTEGER, mime TEXT)"
    )
    db.execute(
        "CREATE TABLE config (id INTEGER PRIMARY KEY AUTOINCREMENT, object TEXT)"
    )
    conn.commit()
    config = configparser.RawConfigParser()
    config.add_section("General")
    config.set("General", "Relative", str(options.relative))
    config.set("General", "FullFile", str(options.fullfile))
    store = io.StringIO()
    config.write(store)
    db.execute("INSERT INTO config (object) values (?)", (store.getvalue(),))
    conn.commit()
    return
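# The config row holds an INI blob written by configparser, e.g.:
#   [General]
#   relative = False
#   fullfile = False
# (configparser lowercases option names on write; getboolean() in
# stored_options() below reads them back case-insensitively.)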

def delete_nonexisting(sqlfile, options):
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbdel = conn.cursor()
    db.execute("SELECT file FROM list")
    # fetch everything first: deleting while iterating the same table can skip rows
    for row in db.fetchall():
        if os.path.exists(row[0]):
            delete = False
            if not options.symlinks:
                if os.path.islink(row[0]):
                    delete = True
        else:
            delete = True
        if delete:
            print("removing.. " + row[0])
            dbdel.execute("DELETE FROM list where file == ?", (row[0],))
    conn.commit()
    return

def disk_used(options):
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    checkpath = filename_join(options.diskused, "", options) + "/"
    if checkpath == "./":
        checkpath = ""
    db.execute(
        "SELECT size,replace(file,?,'') as path FROM list WHERE file LIKE ?",
        (
            checkpath,
            checkpath + "%",
        ),
    )
    entries = []
    sizes = []
    for row in db:
        start_path = row[1].split("/")
        start_path = "/".join(start_path[0 : int(options.diskused_depth)])
        if start_path not in entries:
            entries.append(start_path)
            sizes.append(row[0])
        else:
            sizes[entries.index(start_path)] += row[0]
    for entry in zip(sizes, entries):
        print(
            "| ".join(
                [str(entry[0]).ljust(14), humanize_size(entry[0]).rjust(8), entry[1]]
            )
        )

def filename_join(path, name, options):
    filename = os.path.realpath(os.path.join(path, name))
    if options.relative:
        return os.path.relpath(filename, options.sqlpath)
    return filename
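# With --relative the stored paths are relative to the database's own
# directory, so the catalogue survives moving the whole tree; main()
# chdirs to options.sqlpath before any lookups.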

def find_duplicates(sqlfile, order):
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    dbh = conn.cursor()
    db.execute(
        "SELECT hash,count(*) FROM list WHERE size > 0 GROUP BY hash HAVING count(*) > 1"
    )
    duphash = []
    for row in db.fetchall():
        hash = row[0]
        dbh.execute("SELECT file,size,date FROM list WHERE hash = ?", (hash,))
        flist = dbh.fetchall()
        sort_by_method(flist, order)
        duphash.append((hash, flist))
    duphash.sort(key=lambda group: group[1][0])
    return duphash
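# find_duplicates() returns [(hash, [(file, size, date), ...]), ...]: each
# group is pre-sorted by the chosen --dup-order method, and the groups
# themselves are ordered by their first entry.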

def ftime_match(db, filename, ftime):
    db.execute("SELECT date FROM list where file == ?", (filename,))
    count = db.fetchall()
    return count[0][0] == ftime

def get_folder_contents(db, path):
    """Return the filenames stored directly under the given folder."""
    files = []
    if path == "./":
        db.execute("SELECT file FROM list where file NOT LIKE ?", ("%/%",))
        path = ""
    else:
        db.execute("SELECT file FROM list where file LIKE ?", (path + "%",))
    for row in db:
        try:
            base = row[0].replace(path, "", 1)
        except UnicodeDecodeError:
            print(row[0] + " is giving me trouble.")
            try:
                base = (
                    row[0]
                    .encode("utf-8")
                    .replace(path.encode("utf-8"), b"", 1)
                    .decode("utf-8")
                )
            except (UnicodeDecodeError, UnicodeEncodeError):
                print(row[0] + " is still giving me trouble.")
                sys.exit(1)
        if base.find("/") == -1:
            files.append(base)
    return files

def get_md5(filename, fullfile=False):
    """Return a content hash; only the first 50 MB is read unless fullfile is set."""
    fsize = os.path.getsize(filename)
    if fullfile and fsize > DEFAULT_CHUNK:
        anim_i = 0
        anim_len = len(ANIM)
        block_size = 2**24
        percents_per_block = int(100 / (float(fsize) / block_size))
        md5 = hashlib.md5()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                sys.stderr.write(
                    "\r %s (%02d%%)"
                    % (ANIM[anim_i % anim_len], int(anim_i * percents_per_block))
                )
                sys.stderr.flush()
                anim_i += 1
                md5.update(chunk)
        sys.stderr.write("\r ")
        return md5.hexdigest()
    with open(filename, "rb") as f:
        return hashlib.md5(f.read(DEFAULT_CHUNK)).hexdigest()
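# Note: with the default 50 MB partial hash, two files that share their first
# 50 MB but differ later get the same hash; create the DB with --full to hash
# entire files.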

def has_changes(options):
    conn = sqlite3.connect(options.sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if options.haschanges:
        options.changed = True
    if options.hasdeletions or options.haschanges:
        has_changes_deleted(db)
    if options.hasadditions or options.haschanges:
        has_changes_additions(db, options)

def has_changes_deleted(db, exit=True):
    db.execute("SELECT file FROM list")
    deleted = []
    for row in db:
        if not os.path.exists(row[0]):
            if exit:
                print("True")
                sys.exit(1)
            else:
                deleted.append(row[0])
    return deleted

def has_changes_additions(db, options, exit=True):
    added = []
    changed = []
    for path, dirs, files in os.walk(options.startpath, followlinks=options.symlinks):
        dirs = clean_dirs(dirs)
        db_files = get_folder_contents(db, filename_join(path, "", options) + "/")
        if not options.symlinks:
            files = clean_syms(files, path)
        for file in files:
            filename = filename_join(path, file, options)
            if file == options.sqlfile:
                continue
            # if not is_listed(db,filename):
            if file not in db_files:
                if exit:
                    print("True")
                    sys.exit(1)
                else:
                    added.append(filename)
            else:
                if options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # file content changed
                        if exit:
                            print("True")
                            sys.exit(1)
                        else:
                            changed.append(filename)
    return (added, changed)

# ~ def hash_match(db,filename,hash):
# ~ db.execute("SELECT hash FROM list where file == ?",(filename,))
# ~ count=db.fetchall()
# ~ return count[0][0]==hash

def humanize_date(date):
    if date is None:
        return ""
    return datetime.datetime.fromtimestamp(int(date)).strftime("%Y-%m-%d %H:%M:%S")

def humanize_size(size, precision=1):
    if size is None:
        return "nan"
    suffixes = ["B", "KB", "MB", "GB", "TB"]
    suffixIndex = 0
    defPrecision = 0
    while size > 1024 and suffixIndex < len(suffixes) - 1:
        suffixIndex += 1  # increment the index of the suffix
        size = float(size / 1024.0)  # apply the division
        defPrecision = precision
    return "%.*f%s" % (defPrecision, size, suffixes[suffixIndex])

def is_listed(db, filename):
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    count = db.fetchall()
    return count[0][0] > 0

def matchdb(sqlfile, needle, helper):
    needle = needle.lower()
    import difflib as dl

    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    if len(helper) > 0:
        helper = ["%" + i + "%" for i in helper]
        like_query = " OR ".join(["file LIKE ?" for i in helper])
        db.execute(
            "SELECT file FROM list WHERE " + like_query + " ORDER BY date DESC", helper
        )
    else:
        db.execute("SELECT file FROM list ORDER BY date DESC")
    ratio = 0
    best_match = ""
    for row in db:
        s = dl.SequenceMatcher(None, os.path.basename(row[0]).lower(), needle)
        s_ratio = s.ratio()
        if ratio < s_ratio:
            ratio = s_ratio
            best_match = row[0]
    print(best_match)

def print_duplicates(files):
    for hash in files:
        # print(hash[0])
        i = 1
        for f in hash[1]:
            print(
                "%(i)d|%(s)s|%(d)s|%(f)s "
                % {
                    "i": i,
                    "f": f[0],
                    "d": humanize_date(f[2]),
                    "s": humanize_size(f[1]),
                }
            )
            i += 1
    return
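# Each copy prints as one pipe-separated record, e.g. (hypothetical paths):
#   1|1.5MB|2019-03-02 10:11:12|/path/a.jpg
#   2|1.5MB|2018-01-01 08:00:00|/path/b.jpg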

def print_stderr(s):
    sys.stderr.write(s)
    sys.stderr.write("\n")
    sys.stderr.flush()

def searchdb(sqlfile, needle):
    needle = ["%" + i + "%" for i in needle]
    like_query = " OR ".join(["file LIKE ?" for i in needle])
    conn = sqlite3.connect(sqlfile)
    conn.text_factory = str
    db = conn.cursor()
    db.execute("SELECT file FROM list WHERE " + like_query + " ORDER BY file", needle)
    for row in db:
        print(row[0])

def sort_by_method(flist, order):
    if order == "path":
        flist.sort(key=lambda file: file[0])
    elif order == "file":
        flist.sort(key=lambda file: os.path.basename(file[0]))
    elif order == "age":
        flist.sort(key=lambda file: file[2])
    elif order == "length":
        flist.sort(key=lambda file: len(file[0]))

def stored_options(options):
    try:
        conn = sqlite3.connect(options.sqlfile)
        db = conn.cursor()
        conn.text_factory = str
        db.execute("SELECT object FROM config")
        store = ""
        for row in db:
            store += row[0] + "\n"
        config = configparser.RawConfigParser()
        config.read_file(io.StringIO(store))
        options.relative = config.getboolean("General", "Relative")
        options.fullfile = config.getboolean("General", "FullFile")
    except Exception:
        # older databases have no config table; keep the command-line options
        pass
    return options

def main():
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options)
    options = stored_options(options)
    if options.relative:
        os.chdir(options.sqlpath)
    if options.haschanges or options.hasadditions or options.hasdeletions:
        has_changes(options)
        sys.exit(0)
    if options.check:
        checkdb(options)
        sys.exit(0)
    if len(options.search) > 0 and not options.match:
        searchdb(options.sqlfile, options.search)
        sys.exit(0)
    if options.match:
        matchdb(options.sqlfile, options.match, options.search)
        sys.exit(0)
    if options.diskused:
        disk_used(options)
        sys.exit(0)
    if options.delete:
        print("Deleting entries...")
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print("Adding " + options.startpath + " entries...")
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile, options.duplicate_order)
        print_duplicates(files)
    sys.exit(0)


if __name__ == "__main__":
    main()