new imagelist implementation

This commit is contained in:
Ville Rantanen
2024-04-07 17:28:15 +03:00
parent aaf6044e83
commit 31ab87b9df
5 changed files with 1285 additions and 0 deletions

View File

@@ -0,0 +1,776 @@
import os
import re
import sys
import traceback
from argparse import ArgumentParser
from datetime import datetime
from imagelist2.db import DB, sqlite_sqrt, sqlite_square
from imagelist2.image import ImageMeasure, is_image_extension
from tqdm import tqdm
# Module version string.
__version__ = "0.0.1"
# Default database file name; used as the default value of the -f option.
SQLFILE = "image-list.sqlite"
# Image files are recognised with is_image_extension() (imported above); the
# commented-out, regex-based IMGMATCH pattern that stood here appears to be
# superseded by it.
# Directory names that are skipped while scanning.  setup_options() appends
# the values of the "db -x" option to this list, so it must stay mutable.
BADDIRS = ["_tn", "_med", ".tn", ".med"]
# NOTE(review): not referenced by any code visible in this module.
MINSIZE = 0
class ImageList:
    """Maintain and query an SQLite index of image files.

    All database access goes through ``self.db``.  Three tables are used:
    ``list`` (one row per file: hash, date, size, file), ``data`` (one row
    per content hash: portrait, width, height, fingerprint, sharpness and
    the colour columns R, G, B, BR, BG, BB) and ``tags`` (hash, tag).
    """

    def __init__(self, opts):
        """Open the database named by ``opts.sqlfile``.

        Args:
            opts: Parsed command-line options (see ``setup_options``).
        """
        self.options = opts
        self.db = DB(self.options.sqlfile)
        # Directory that contains the database file.
        self.root_path = os.path.dirname(os.path.realpath(self.options.sqlfile))

    def recursive_add(self):
        """Walk ``options.startpath`` and insert or refresh rows in ``list``.

        New image files are inserted unless ``options.no_add`` is set.  Files
        already in the database are re-read only when ``options.changed`` is
        set and either their time or their hash no longer matches.  Symbolic
        links to files are skipped unless ``options.symlinks`` is set.
        """
        start = os.path.realpath(self.options.startpath)
        follow = self.options.symlinks

        # First pass only counts directories so the progress bar has a total.
        dir_count = 0
        for _path, dirs, _files in os.walk(start, followlinks=follow):
            clean_dirs(dirs)
            dir_count += 1

        progress = tqdm(
            total=dir_count,
            desc="Directories",
            position=0,
            delay=1,
            leave=False,
        )
        try:
            for path, dirs, files in os.walk(start, followlinks=follow):
                progress.update()
                progress.write(self.db.file2relative(path))
                clean_dirs(dirs)
                # Sorting in place also fixes the order os.walk descends in.
                dirs.sort()
                candidates = [os.path.join(path, f) for f in files]
                if not follow:
                    # Must happen before realpath(): a resolved path is never
                    # reported as a link, so filtering afterwards is a no-op.
                    candidates = clean_syms(candidates)
                files = sorted(os.path.realpath(f) for f in candidates)
                db_files = self.db.get_folder_contents(path + "/")
                for file in tqdm(files, desc="Files", delay=1, position=1, leave=False):
                    if not is_image_extension(file):
                        continue
                    image = ImageMeasure(file)
                    if file not in db_files:
                        if not self.options.no_add:
                            self.add_single(image, change=False)
                    elif self.options.changed and (
                        self.db.is_time_mismatch(image) or self.db.is_hash_mismatch(image)
                    ):
                        self.add_single(image, change=True)
                # Commit once per directory so an interrupted scan keeps its work.
                self.db.conn.commit()
        finally:
            progress.close()

    def add_single(self, image, change=False):
        """Insert or update the ``list`` row of one image.

        Args:
            image: Object providing ``filename``, ``get_hash()``,
                ``get_time()`` and ``get_size()``.
            change: ``True`` to update the existing row for this file,
                ``False`` to insert a new row.

        On any database or measurement error the message and traceback are
        printed to stdout and the process exits with status 1.
        """
        if change:
            query = "UPDATE list SET hash=?, date=? ,size=? WHERE file=?"
            error_msg = f"error changing file: {image.filename}"
        else:
            query = "INSERT INTO list(hash,date,size,file) VALUES (?,?,?,?)"
            error_msg = f"error adding file: {image.filename}"
        try:
            self.db.cursor().execute(
                query,
                (
                    image.get_hash(),
                    image.get_time(),
                    image.get_size(),
                    image.filename,
                ),
            )
        except Exception:
            print(error_msg)
            traceback.print_exc(file=sys.stdout)
            sys.exit(1)

    def base_add(self):
        """Create a ``data`` row for every hash in ``list`` that lacks one.

        Only portrait flag, width and height are stored here; the remaining
        columns are filled in by ``measure``.
        """
        missing_base = (
            self.db.cursor()
            .execute(
                """
                SELECT list.hash, list.file
                FROM list
                LEFT JOIN data ON data.hash = list.hash
                WHERE data.hash IS NULL
                """
            )
            .fetchall()
        )
        if not missing_base:
            return
        seen_hash = set()
        cursor = self.db.cursor()
        for i, (file_hash, filename) in enumerate(tqdm(missing_base, desc="Base info", delay=1)):
            # Several files may share one hash; one data row is enough.
            if file_hash in seen_hash:
                continue
            seen_hash.add(file_hash)
            if filename is None:
                continue
            image = ImageMeasure(filename)
            cursor.execute(
                """INSERT INTO data(hash,portrait,width,height)
                VALUES(?,?,?,?)""",
                (
                    file_hash,
                    image.get_portrait(),
                    image.get_width(),
                    image.get_height(),
                ),
            )
            if i % 50 == 0:
                self.db.conn.commit()
        self.db.conn.commit()

    def delete_missing(self):
        """Remove ``list`` rows whose file no longer exists on disk."""
        rows = self.db.cursor().execute("SELECT file FROM list").fetchall()
        to_delete = [
            row[0]
            for row in tqdm(rows, delay=1, desc="Clean files")
            if not os.path.exists(row[0])
        ]
        cursor = self.db.cursor()
        for file in tqdm(to_delete, desc="Cleaning", delay=1):
            cursor.execute("DELETE FROM list where file == ?", (file,))
        self.db.conn.commit()
        if to_delete:
            print(f"Cleaned {len(to_delete)} files")

    def clean_data(self):
        """Delete ``data`` and ``tags`` rows whose hash is no longer in ``list``."""
        self._delete_orphans("data", "metadata")
        self._delete_orphans("tags", "tags")

    def _delete_orphans(self, table, label):
        """Delete rows of ``table`` whose hash has no match in ``list``.

        Args:
            table: Table name; only the fixed names passed by ``clean_data``
                are ever interpolated into the SQL text.
            label: Noun used in the "Cleaned N ..." summary line.
        """
        orphans = [
            row[0]
            for row in self.db.cursor().execute(
                f"SELECT {table}.hash FROM {table} "
                f"LEFT JOIN list ON list.hash = {table}.hash WHERE list.hash IS NULL"
            )
        ]
        if not orphans:
            return
        cursor = self.db.cursor()
        for orphan_hash in orphans:
            cursor.execute(f"DELETE FROM {table} where hash = ?", (orphan_hash,))
        self.db.conn.commit()
        print(f"Cleaned {len(orphans)} {label}")

    def measure(self):
        """Fill in fingerprint, sharpness and colour columns missing in ``data``."""
        missing_measurements = (
            self.db.cursor()
            .execute(
                """
                SELECT
                  list.file,
                  data.hash,
                  data.fingerprint,
                  data.sharpness,
                  data.R,
                  data.G,
                  data.B,
                  data.BR,
                  data.BG,
                  data.BB
                FROM data
                LEFT JOIN list ON data.hash = list.hash
                WHERE data.fingerprint IS NULL
                OR data.sharpness IS NULL
                OR data.R IS NULL
                """
            )
            .fetchall()
        )
        if not missing_measurements:
            return
        color_keys = ("R", "G", "B", "BR", "BG", "BB")
        seen_hash = set()
        cursor = self.db.cursor()
        for i, row in enumerate(tqdm(missing_measurements, desc="Measure", delay=1, smoothing=0.01)):
            filename, file_hash = row[0], row[1]
            if filename is None:
                continue
            # The join yields one row per file; measure each hash only once.
            if file_hash in seen_hash:
                continue
            seen_hash.add(file_hash)
            image = ImageMeasure(filename)
            # Seed the object with whatever the database already holds.
            image.hash = file_hash
            image.fingerprint = row[2]
            image.sharpness = row[3]
            for key, value in zip(color_keys, row[4:10]):
                image.colors[key] = value
            # Calculate if required
            image.fingerprint = image.get_fingerprint()
            image.sharpness = image.get_sharpness()
            image.colors.update(image.get_colors())
            cursor.execute(
                """UPDATE data SET
                fingerprint = ?,
                sharpness = ?,
                R = ?,
                G = ?,
                B = ?,
                BR = ?,
                BG = ?,
                BB = ?
                WHERE hash = ?
                """,
                (
                    image.fingerprint,
                    image.sharpness,
                    image.colors["R"],
                    image.colors["G"],
                    image.colors["B"],
                    image.colors["BR"],
                    image.colors["BG"],
                    image.colors["BB"],
                    image.hash,
                ),
            )
            if i % 50 == 0:
                self.db.conn.commit()
        self.db.conn.commit()

    def disk_used(self):
        """Print the summed file sizes below ``options.path``.

        Sizes are grouped by the first ``options.diskused_depth`` path
        components relative to ``options.path``; ``None`` means no limit.
        Each output line holds the byte count, the humanized size and the
        path.  Rows whose size is NULL are counted as 0 bytes.
        """
        depth = self.options.diskused_depth
        # Trailing separator keeps "/a/pics" from matching "/a/pics2/...".
        prefix = os.path.join(os.path.realpath(self.options.path), "")
        # substr() gives an exact, case-sensitive prefix test; LIKE would be
        # case-insensitive and treat "%" and "_" in the path as wildcards.
        result = self.db.cursor().execute(
            "SELECT size, file FROM list WHERE substr(file, 1, ?) = ?",
            (len(prefix), prefix),
        )
        totals = {}
        for size, file in result:
            relative = file[len(prefix):]
            start_path = "/".join(relative.split("/")[:depth])
            if len(start_path) != len(relative):
                # The path was truncated, so this entry is a directory.
                start_path += "/"
            totals[start_path] = totals.get(start_path, 0) + (size or 0)
        for entry, size in totals.items():
            print("| ".join([str(size).ljust(14), humanize_size(size).rjust(8), entry]))

    def duplicates(self):
        """Print files that share a content hash with at least one other file.

        The first file of each group (ordered by path) is prefixed with
        ``=``, the others with ``>``; each line is ``<index>|<path>``.
        """
        result = self.db.cursor().execute(
            """
            WITH
            duplicates AS (SELECT hash FROM list GROUP BY hash HAVING count(hash) > 1),
            f AS (SELECT
                list.hash,list.file FROM list
                LEFT JOIN duplicates ON (list.hash = duplicates.hash)
                WHERE duplicates.hash IS NOT NULL
                ORDER BY file
            )
            SELECT
                CAST((row_number() OVER (PARTITION BY f.hash ORDER BY f.file))-1 AS TEXT) AS row,
                RELATIVE(file)
            FROM f
            """,
        )
        for row in result:
            c = "=" if row[0] == "0" else ">"
            print(c + "|".join(row))

    def nearestcolor(self):
        """Find closest matching images to given RGB color.

        ``options.nearestcolor`` is either ``R,G,B[,hits]`` or
        ``file[,hits]``; ``hits`` defaults to 1.  In the file form the
        reference colour is read from the file and the file itself is
        excluded from the output.

        Raises:
            ValueError: If an all-integer argument has fewer than three
                values, or ``hits`` in the file form is not an integer.
        """
        spec = self.options.nearestcolor
        try:
            values = [int(i) for i in spec.strip().strip('"').split(",")]
        except ValueError:
            # Not a list of integers: interpret as "file" or "file,hits".
            parts = spec.strip().split(",")
            limit = 1 if len(parts) == 1 else int(parts[1])
            image = ImageMeasure(parts[0])
            colors = image.get_colors()
            f = os.path.realpath(parts[0])
            red, green, blue = colors["R"], colors["G"], colors["B"]
        else:
            if len(values) == 3:
                values.append(1)
            if len(values) < 4:
                raise ValueError(f"expected R,G,B[,hits] or file[,hits], got: {spec}")
            red, green, blue, limit = values[:4]
            f = ""
        self.db.conn.create_function("SQUARE", 1, sqlite_square)
        self.db.conn.create_function("SQRT", 1, sqlite_sqrt)
        # Rows without measured colours are excluded: their distance would be
        # NULL, which SQLite sorts first, and the int() calls below would fail.
        # NOTE(review): LIMIT is applied before the source file is excluded,
        # so "file" with the default single hit prints nothing when the file
        # itself is indexed - confirm whether that is intended.
        result = self.db.cursor().execute(
            """
            WITH distances AS (
                SELECT hash, ROUND(SQRT(SQUARE(BR-?)+SQUARE(BG-?)+SQUARE(BB-?)),1) as distance,BR,BG,BB
                FROM data
                WHERE BR IS NOT NULL AND BG IS NOT NULL AND BB IS NOT NULL
                ORDER BY distance LIMIT ?
            )
            SELECT
                RELATIVE(list.file),
                distances.distance,
                distances.BR,
                distances.BG,
                distances.BB
            FROM list
            LEFT JOIN
                distances ON (distances.hash = list.hash)
            WHERE distances.hash IS NOT NULL AND list.file != ?
            ORDER BY distances.distance
            """,
            (red, green, blue, limit, f),
        )
        print("|".join(("Path", "Dist", "BR", "BG", "BB")))
        for p, d, r, g, b in result:
            print("|".join((p, str(d), str(int(r)), str(int(g)), str(int(b)))))

    def similarity(self):
        """Print groups of similar and/or visually identical images.

        With ``options.similarity`` set to an integer, every measured image
        is compared with every other one using that threshold.  With
        ``file[,threshold]`` only that file is compared (threshold defaults
        to 20).  With ``options.visual_duplicate`` set, images sharing a
        fingerprint are listed.  Output columns are announced by the header
        line printed first.
        """

        def set_image(row):
            """Build an image object from (hash, fingerprint, sharpness, width, height, R, G, B)."""
            image = ImageMeasure(None)
            image.hash = row[0]
            image.fingerprint = row[1]
            image.sharpness = row[2]
            image.width = row[3]
            image.height = row[4]
            image.colors["R"] = row[5]
            image.colors["G"] = row[6]
            image.colors["B"] = row[7]
            return image

        def get_matching(cmp_image, thr):
            """Return measured images whose similarity difference is at most ``thr``."""
            compare_list = self.db.cursor().execute(
                """SELECT hash,fingerprint,sharpness,width,height,R,G,B
                FROM data
                WHERE fingerprint IS NOT NULL AND sharpness > 0 AND hash != ?""",
                (cmp_image.hash,),
            )
            match_list = []
            for row2 in compare_list:
                other_image = set_image(row2)
                distance = cmp_image.similarity_difference(other_image)
                if distance <= thr:
                    other_image.similarity["distance"] = distance
                    other_image.similarity["color"] = cmp_image.color_difference(other_image)
                    other_image.similarity["aspect"] = cmp_image.shape_difference(other_image)
                    other_image.filename = self.db.hash2file(other_image.hash)
                    match_list.append(other_image)
            return match_list

        def get_duplicates():
            """Return rows of files sharing a fingerprint, numbered within each group."""
            return self.db.cursor().execute(
                """
                WITH
                duplicates AS (SELECT fingerprint FROM data GROUP BY fingerprint HAVING count(fingerprint) > 1 AND sharpness > 0),
                duphash AS (
                    SELECT duplicates.fingerprint, data.hash, data.sharpness, data.width, data.height, data.R, data.G, data.B
                    FROM duplicates
                    LEFT JOIN data ON (duplicates.fingerprint = data.fingerprint)
                ),
                f AS (SELECT
                    duphash.fingerprint, duphash.hash,list.file,
                    duphash.sharpness,
                    duphash.width, duphash.height,
                    duphash.R, duphash.G, duphash.B
                    FROM duphash
                    LEFT JOIN list ON (list.hash = duphash.hash)
                    WHERE list.file IS NOT NULL
                    ORDER BY list.file
                )
                SELECT
                    CAST((row_number() OVER (PARTITION BY f.fingerprint ORDER BY f.file))-1 AS TEXT) AS row,
                    file,
                    hash,
                    fingerprint,
                    sharpness,width,height,R,G,B
                FROM f
                """,
            )

        def print_matching(match_list, cmp_image):
            """Print the reference image followed by its matches, nearest first."""
            if not match_list:
                return
            match_list.sort(key=lambda i: i.similarity["distance"])
            print_similarity_row(cmp_image, "=", 0)
            for i, img in enumerate(match_list):
                print_similarity_row(img, ">", i + 1)

        def print_similarity_row(img, c, index):
            """Print one output row; ``img.filename`` is a list of paths."""
            fnames = ", ".join([self.db.file2relative(f) for f in img.filename])
            print(
                f"{c}{index}|{fnames}|{img.similarity['distance']}|{img.similarity['color']}|{img.similarity['aspect']}|{img.sharpness}|{img.width}|{img.height}"
            )

        print("|".join(("#", "File", "SD", "CD", "RD", "Shp", "W", "H")))

        if self.options.similarity:
            try:
                thr = int(self.options.similarity)
                file = None
            except ValueError:
                # Not a bare threshold: "file" or "file,threshold".
                parts = self.options.similarity.split(",")
                thr = 20 if len(parts) == 1 else int(parts[1])
                file = parts[0]

            if file is None:
                # Measure similarity on all files
                fingerprint_list = self.db.cursor().execute(
                    """
                    SELECT hash,fingerprint,sharpness,width,height,R,G,B
                    FROM data
                    WHERE fingerprint IS NOT NULL
                    AND sharpness > 0"""
                )
                # Hashes already reported as a match are not used as a
                # reference again.
                checked = set()
                for row in fingerprint_list:
                    if row[0] in checked:
                        continue
                    cmp_image = set_image(row)
                    cmp_image.filename = self.db.hash2file(cmp_image.hash)
                    match_list = get_matching(cmp_image, thr)
                    checked.update(m.hash for m in match_list)
                    print_matching(match_list, cmp_image)
            else:
                # Read a single image and find images similar to it
                cmp_image = ImageMeasure(file)
                cmp_image.set_all()
                cmp_image.filename = [cmp_image.filename]
                match_list = get_matching(cmp_image, thr)
                print_matching(match_list, cmp_image)

        if self.options.visual_duplicate:
            # cmp_image stays None until the first group starts, so an empty
            # result set prints nothing instead of raising.
            cmp_image = None
            match_list = []
            for row in get_duplicates():
                if row[0] == "0":
                    # A new group begins: flush the previous one.
                    if cmp_image is not None:
                        print_matching(match_list, cmp_image)
                    cmp_image = set_image(row[2:])
                    cmp_image.filename = [row[1]]
                    match_list = []
                else:
                    other_image = set_image(row[2:])
                    other_image.filename = [row[1]]
                    other_image.similarity["color"] = cmp_image.color_difference(other_image)
                    other_image.similarity["aspect"] = cmp_image.shape_difference(other_image)
                    match_list.append(other_image)
            if cmp_image is not None:
                print_matching(match_list, cmp_image)

    def tag_manage(self):
        """Add and delete tags of ``options.file``, then print its tags.

        Tags listed in ``options.add_tag`` are inserted, those in
        ``options.delete_tag`` removed; the remaining tags are printed as
        one comma-separated line.

        Raises:
            Exception: If the file is not in the database.
        """
        fname = os.path.realpath(self.options.file)
        file_hash = self.db.file2hash(fname)
        if file_hash is None:
            raise Exception(f"{fname} not in database")

        for add in self.options.add_tag:
            self.db.cursor().execute(
                "INSERT INTO tags(hash,tag) VALUES (?,?)",
                (file_hash, add),
            )
        for rm in self.options.delete_tag:
            self.db.cursor().execute(
                "DELETE FROM tags WHERE hash = ? AND tag = ?",
                (file_hash, rm),
            )
        if self.options.add_tag or self.options.delete_tag:
            self.db.conn.commit()
        tags = self.db.cursor().execute(
            """
            SELECT
                tags.tag
            FROM tags
            WHERE tags.hash = ?
            """,
            (file_hash,),
        )
        print(",".join([x[0] for x in tags]))
def clean_dirs(dirs):
    """Prune excluded and hidden directory names from *dirs* in place.

    The list object itself is modified (not replaced) because os.walk only
    honours pruning done on the very list it handed out.  A name is dropped
    when it is listed in BADDIRS or starts with a dot.
    """
    dirs[:] = [
        name
        for name in dirs
        if name not in BADDIRS and not name.startswith(".")
    ]
def clean_syms(files):
    """Return the entries of *files* that are not symbolic links, in order."""
    kept = []
    for candidate in files:
        if os.path.islink(candidate):
            continue
        kept.append(candidate)
    return kept
def humanize_size(size, precision=1):
    """Format a byte count with a binary (1024-based) unit suffix.

    Args:
        size: Number of bytes, or ``None``.
        precision: Decimals shown once the value has been scaled; plain
            byte counts are always printed without decimals.

    Returns:
        ``"nan"`` for ``None``, otherwise e.g. ``"512B"`` or ``"1.5KB"``.
        Values beyond the largest known unit stay expressed in ``EB``.
    """
    if size is None:
        return "nan"
    suffixes = ["B", "KB", "MB", "GB", "TB", "PB", "EB"]
    index = 0
    digits = 0
    # ">=" so that exactly 1024 bytes reads "1.0KB"; the index bound keeps
    # very large values from running past the end of the suffix list.
    while size >= 1024 and index < len(suffixes) - 1:
        index += 1
        size = size / 1024.0
        digits = precision
    return "%.*f%s" % (digits, size, suffixes[index])
def humanize_date(date):
    """Format a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` in local time.

    Args:
        date: Seconds since the epoch (anything ``int()`` accepts), or
            ``None``.

    Returns:
        The formatted timestamp, or an empty string for ``None``.
    """
    if date is None:
        return ""
    return datetime.fromtimestamp(int(date)).strftime("%Y-%m-%d %H:%M:%S")
def setup_options(args=None):
    """Build the command-line parser and parse the arguments.

    Args:
        args: Argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        The parsed options namespace.  ``options.command`` is one of
        ``db``, ``search``, ``du`` or ``tag``.

    Side effects:
        Prints help and exits with status 0 when the command is ``help`` or
        missing.  For the ``db`` command the ``-x`` values are appended to
        the module-level ``BADDIRS`` list.
    """
    parser = ArgumentParser(description="Maintains a list of images sqlite file")
    parser.add_argument(
        "-f",
        action="store",
        dest="sqlfile",
        default=SQLFILE,
        help="SQL file name to use [%(default)s]",
    )

    subparsers = parser.add_subparsers(title="Command", dest="command")
    # The "help" sub-command takes no options; it only needs to be registered.
    subparsers.add_parser("help", help="Help on all commands")
    db = subparsers.add_parser("db", help="Update database")
    search = subparsers.add_parser("search", help="Search similarity")
    du = subparsers.add_parser("du", help="Disk usage")
    tag = subparsers.add_parser("tag", help="Tag manager")

    db.add_argument(
        "--no-add",
        "-a",
        action="store_true",
        dest="no_add",
        default=False,
        help="Do not add new files [%(default)s]",
    )
    db.add_argument(
        "--measure",
        "-m",
        action="store_true",
        dest="measure",
        default=False,
        help="Measure various statistics for similarity/color searches. [%(default)s]",
    )
    db.add_argument(
        "--changed",
        "-c",
        action="store_true",
        dest="changed",
        default=False,
        help="Search for changed files and update their entries [%(default)s]",
    )
    db.add_argument(
        "--no-delete",
        "-d",
        action="store_true",
        dest="no_delete",
        default=False,
        help="Do not delete non-existing entries [%(default)s]",
    )
    db.add_argument(
        "--no-delete-data",
        "-D",
        action="store_true",
        dest="no_delete_data",
        default=False,
        help="Do not delete unused metadata [%(default)s]",
    )
    db.add_argument(
        "-x",
        action="append",
        dest="exclude",
        default=[],
        help="Exclude folder name. This option may be issued several times.",
    )
    db.add_argument(
        "-l",
        action="store_true",
        dest="symlinks",
        default=False,
        help="Follow symbolic links [%(default)s]",
    )
    db.add_argument("startpath", action="store", default=".", nargs="?", help="Path to start scanning for images.")

    du.add_argument(
        "-d",
        type=int,
        action="store",
        dest="diskused_depth",
        default=None,
        help="Depth of summarization for du.",
    )
    du.add_argument(
        type=str,
        action="store",
        dest="path",
        default=".",
        help="Print directory sizes. Argument is the path where directories are listed from.",
        nargs="?",
    )

    search.add_argument(
        "--dup",
        action="store_true",
        dest="duplicate",
        default=False,
        help="Return a list of duplicate files, based on file hashes. [%(default)s]",
    )
    search.add_argument(
        "--visdup",
        action="store_true",
        dest="visual_duplicate",
        default=False,
        help="Return a list of visually exact duplicate files, based on perceptual hashes. [%(default)s]",
    )
    search.add_argument(
        "--color",
        type=str,
        dest="nearestcolor",
        default=False,
        help="Search list for nearest ambient color. format: R,G,B in uint8. Add fourth value to limit search to number of hits. Also accepts format file,hits to find nearest color to given file.",
    )
    search.add_argument(
        "--similar",
        type=str,
        dest="similarity",
        default=None,
        help="Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity. "
        + "If value is a filename, search similar to that image. "
        + "Append with ',value' to limit similarity. default to 20. "
        + "The output columns: SD SimilarityDiff., CD ColorDiff., "
        + "RD AspectRatioDiff.,Shp SharpnessIndex. This function does not return exact duplicates.",
    )

    tag.add_argument(
        "-t",
        action="append",
        dest="add_tag",
        default=[],
        help="Give file a tag.",
    )
    tag.add_argument(
        "-d",
        action="append",
        dest="delete_tag",
        default=[],
        help="Delete a tag.",
    )
    tag.add_argument(
        type=str,
        dest="file",
        default=None,
        help="File name for tagging.",
    )

    options = parser.parse_args(args)
    if options.command == "help":
        parser.print_help()
        print("\n====\nCommand: db")
        db.print_help()
        print("\n====\nCommand: search")
        search.print_help()
        print("\n====\nCommand: du")
        du.print_help()
        print("\n====\nCommand: tag")
        tag.print_help()
        sys.exit(0)
    if options.command is None:
        parser.print_help()
        sys.exit(0)
    if options.command == "db":
        # Excluded folder names are pruned by clean_dirs() via BADDIRS.
        BADDIRS.extend(options.exclude)
    return options
def main():
    """Parse the command line and run the selected command.

    ``db`` updates the database (delete missing files, add new ones, clean
    orphaned metadata, optionally measure), ``du`` prints disk usage,
    ``search`` runs the requested searches and ``tag`` manages tags.
    """
    options = setup_options()
    il = ImageList(options)
    if options.command == "db":
        if not options.no_delete:
            il.delete_missing()
        if not options.no_add:
            il.recursive_add()
            il.base_add()
        if not options.no_delete_data:
            il.clean_data()
        if options.measure:
            # Measurements update existing data rows, so make sure they exist.
            il.base_add()
            il.measure()
    if options.command == "du":
        il.disk_used()
    if options.command == "search":
        if options.duplicate:
            il.duplicates()
        if options.visual_duplicate:
            il.similarity()
        if options.nearestcolor:
            il.nearestcolor()
        # similarity() handles both --similar and --visdup in one call, so it
        # must not run a second time when it already ran for --visdup.
        if options.similarity and not options.visual_duplicate:
            il.similarity()
    if options.command == "tag":
        il.tag_manage()
    print("")


if __name__ == "__main__":
    main()