From 86c516742e2042b621357809bf86234ac0c0043c Mon Sep 17 00:00:00 2001 From: q Date: Fri, 16 May 2025 23:18:17 +0300 Subject: [PATCH] switch to using p_hash --- py-packages/imagelist2/imagelist2/__init__.py | 336 +++++++++++------- py-packages/imagelist2/imagelist2/db.py | 17 + py-packages/imagelist2/imagelist2/image.py | 65 +++- py-packages/imagelist2/pyproject.toml | 2 +- py-packages/imagelist2/setup.py | 2 +- 5 files changed, 274 insertions(+), 148 deletions(-) diff --git a/py-packages/imagelist2/imagelist2/__init__.py b/py-packages/imagelist2/imagelist2/__init__.py index 3b51d35..3b7a126 100644 --- a/py-packages/imagelist2/imagelist2/__init__.py +++ b/py-packages/imagelist2/imagelist2/__init__.py @@ -5,11 +5,12 @@ import traceback from argparse import ArgumentParser from datetime import datetime +import tabulate from imagelist2.db import DB, sqlite_sqrt, sqlite_square from imagelist2.image import ImageMeasure, is_image_extension from tqdm import tqdm -__version__ = "0.0.3" +__version__ = "0.0.4" SQLFILE = "image-list.sqlite" # IMGMATCH = re.compile("|".join([".*\." + x + "$" |.*\.jpeg$|.*\.png$|.*\.gif$|.*\.tif$", re.I) BADDIRS = ["_tn", "_med", ".tn", ".med"] @@ -22,6 +23,7 @@ class ImageList: self.options = opts self.db = DB(self.options.sqlfile) self.root_path = os.path.dirname(os.path.realpath(self.options.sqlfile)) + self.similarity_header = ("#", "File", "PD", "CD", "RD", "Shp", "W", "H") def recursive_add(self): @@ -126,13 +128,7 @@ class ImageList: cursor.execute( """INSERT INTO data(hash,portrait,width,height,description) VALUES(?,?,?,?,?)""", - ( - row[0], - image.get_portrait(), - image.get_width(), - image.get_height(), - image.get_description() - ), + (row[0], image.get_portrait(), image.get_width(), image.get_height(), image.get_description()), ) if i % 50 == 0: self.db.conn.commit() @@ -194,6 +190,8 @@ class ImageList: list.file, data.hash, data.fingerprint, + data.w_hash, + data.p_hash, data.sharpness, data.R, data.G, @@ -223,21 +221,25 @@ class ImageList: image = ImageMeasure(filename) image.hash = row[1] image.fingerprint = row[2] - image.sharpness = row[3] - image.colors["R"] = row[4] - image.colors["G"] = row[5] - image.colors["B"] = row[6] - image.colors["BR"] = row[7] - image.colors["BG"] = row[8] - image.colors["BB"] = row[9] + image.w_hash = row[3] + image.p_hash = row[4] + image.sharpness = row[5] + image.colors["R"] = row[6] + image.colors["G"] = row[7] + image.colors["B"] = row[8] + image.colors["BR"] = row[9] + image.colors["BG"] = row[10] + image.colors["BB"] = row[11] # Calculate if required - image.fingerprint = image.get_fingerprint() + image.get_fingerprint() image.sharpness = image.get_sharpness() image.colors.update(image.get_colors()) cursor.execute( """UPDATE data SET fingerprint = ?, + w_hash = ?, + p_hash = ?, sharpness = ?, R = ?, G = ?, @@ -249,6 +251,8 @@ class ImageList: """, ( image.fingerprint, + image.w_hash, + image.p_hash, image.sharpness, image.colors["R"], image.colors["G"], @@ -291,8 +295,10 @@ class ImageList: sizes.append(row[0]) else: sizes[entries.index(start_path)] += row[0] + table = Tabulate(("Size[b]", "Size", "Path")) for entry in zip(sizes, entries): - print("| ".join([str(entry[0]).ljust(14), humanize_size(entry[0]).rjust(8), entry[1]])) + table.append((entry[0], humanize_size(entry[0]), entry[1])) + table.print() def duplicates(self): result = self.db.cursor().execute( @@ -311,9 +317,11 @@ class ImageList: FROM f """, ) + table = Tabulate(["#", "File"]) for row in result: - c = "=" if row[0] == "0" else ">" - print(c + "|".join(row)) + c = "==" if row[0] == "0" else f">{row[0]}" + table.append([c, row[1]]) + table.print() def nearestcolor(self): """Find closest matching images to given RGB color""" @@ -356,21 +364,19 @@ class ImageList: """, (src[0], src[1], src[2], src[3], f), ) - - print("|".join(("Path", "Dist", "BR", "BG", "BB"))) + table = Tabulate(("Path", "Dist", "BR", "BG", "BB")) for hit in result: p, d, r, g, b = hit - print( - "|".join( - ( - p, - str(d), - str(int(r)), - str(int(g)), - str(int(b)), - ) + table.append( + ( + p, + str(d), + str(int(r)), + str(int(g)), + str(int(b)), ) ) + table.print() def similarity(self): @@ -378,79 +384,159 @@ class ImageList: image = ImageMeasure(None) image.hash = row[0] image.fingerprint = row[1] - image.sharpness = row[2] - image.width = row[3] - image.height = row[4] - image.colors["R"] = row[5] - image.colors["G"] = row[6] - image.colors["B"] = row[7] + image.w_hash = row[2] + image.p_hash = row[3] + image.sharpness = row[4] + image.width = row[5] + image.height = row[6] + image.colors["R"] = row[7] + image.colors["G"] = row[8] + image.colors["B"] = row[9] return image - def get_matching(cmp_image): - + def print_visually_similar(file, thr): + cmp_image = ImageMeasure(file) + cmp_image.set_all() + cmp_image.filename = cmp_image.filename compare_list = self.db.cursor().execute( - """SELECT hash,fingerprint,sharpness,width,height,R,G,B - FROM data - WHERE fingerprint IS NOT NULL AND sharpness > 0 AND hash != ?""", - (cmp_image.hash,), + """SELECT + RELATIVE(file),width,height,sharpness, + PDISTANCE(p_hash, ?) AS p_dist, + COLORDIFF(R,G,B,?,?,?) AS c_diff, + SHAPEDIFF(width,height,?,?) AS s_diff + FROM files + WHERE p_hash IS NOT NULL AND + sharpness > 0 AND + hash != ? AND + p_dist <= ? + ORDER BY p_dist, file""", + ( + cmp_image.p_hash, + cmp_image.colors["R"], + cmp_image.colors["G"], + cmp_image.colors["B"], + cmp_image.width, + cmp_image.height, + cmp_image.hash, + thr, + ), ) - match_list = [] - for row2 in compare_list: - other_image = set_image(row2) - similarity = cmp_image.similarity_difference(other_image) - if similarity <= thr: - other_image.similarity["distance"] = similarity - other_image.similarity["color"] = cmp_image.color_difference(other_image) - other_image.similarity["aspect"] = cmp_image.shape_difference(other_image) - other_image.filename = self.db.hash2file(other_image.hash) - match_list.append(other_image) - return match_list + table = Tabulate(self.similarity_header) + table.append( + ( + "==", + self.db.file2relative(cmp_image.filename), + 0, + 0, + 0, + cmp_image.sharpness, + cmp_image.width, + cmp_image.height, + ) + ) + for counter, row in enumerate(compare_list): + f2, w2, h2, s2, pdist, cdiff, sdiff = row + table.append((f">{counter+1}", f2, pdist, cdiff, sdiff, s2, w2, h2)) + + table.print() + + def get_visual_duplicates(): - def get_duplicates(): return self.db.cursor().execute( """ - WITH - duplicates AS (SELECT fingerprint FROM data GROUP BY fingerprint HAVING count(fingerprint) > 1 AND sharpness > 0), - duphash AS ( - SELECT duplicates.fingerprint, data.hash, data.sharpness, data.width, data.height, data.R, data.G, data.B - FROM duplicates - LEFT JOIN data ON (duplicates.fingerprint = data.fingerprint) - ), - f AS (SELECT - duphash.fingerprint, duphash.hash,list.file, - duphash.sharpness, - duphash.width, duphash.height, - duphash.R, duphash.G, duphash.B - FROM duphash - LEFT JOIN list ON (list.hash = duphash.hash) - WHERE list.file IS NOT NULL - ORDER BY list.file + WITH + duplicates AS (SELECT p_hash FROM data GROUP BY p_hash HAVING count(p_hash) > 1) + SELECT + RELATIVE(files.file) AS file, + files.width, + files.height, + files.sharpness, + files.R, + files.G, + files.B, + files.p_hash + FROM files + WHERE p_hash IN ( SELECT p_hash FROM duplicates ) + ORDER BY p_hash, files.size DESC + """ + ) + + def print_visual_duplicates(): + fblock = None + counter = 0 + table = Tabulate(self.similarity_header) + for row in get_visual_duplicates(): + f, w, h, s, r, g, b, p_hash = row + if fblock != p_hash: + fblock = p_hash + counter = 0 + table.append(("==", f, 0, 0, 0, s, w, h)) + image1 = ImageMeasure(f) + image1.width = w + image1.height = h + image1.sharpness = s + image1.colors.update({"B": b, "G": g, "R": r}) + continue + counter += 1 + image2 = ImageMeasure(f) + image2.width = w + image2.height = h + image2.sharpness = s + image2.colors.update({"B": b, "G": g, "R": r}) + cdiff = image1.color_difference(image2) + sdiff = image1.shape_difference(image2) + table.append((f">{counter}", f, 0, cdiff, sdiff, s, w, h)) + table.print() + + def print_self_similarity(thr): + fingerprint_list = self.db.cursor().execute( + """ + WITH disttab AS ( + WITH + t1 AS ( SELECT * FROM files ), + t2 AS ( SELECT * FROM files ) + SELECT + RELATIVE(t1.file) AS file1, + t1.width AS width1, + t1.height AS height1, + t1.sharpness AS sharpness1, + RELATIVE(t2.file) AS file2, + t2.width AS width2, + t2.height AS height2, + t2.sharpness AS sharpness2, + PDISTANCE(t1.p_hash,t2.p_hash) AS p_distance, + COLORDIFF(t1.R,t1.G,t1.B,t2.R,t2.G,t2.B) AS c_diff, + SHAPEDIFF(t1.width,t1.height,t2.width,t2.height) AS s_diff + FROM t1 INNER JOIN t2 + ON t1.file < t2.file + WHERE p_distance <= ? + ORDER BY t1.file, p_distance, t2.file ) - SELECT - CAST((row_number() OVER (PARTITION BY f.fingerprint))-1 AS TEXT) AS row, - file, - hash, - fingerprint, - sharpness,width,height,R,G,B - FROM f - """, + SELECT * FROM disttab + """, + (thr,), ) + f1block = None + counter = 0 + table = Tabulate(self.similarity_header) + for row in fingerprint_list: + f1, w1, h1, s1, f2, w2, h2, s2, pdist, cdiff, sdiff = row + if f1block != f1: + f1block = f1 + counter = 0 + table.append(("==", f1, 0, 0, 0, s1, w1, h1)) + counter += 1 + table.append((f">{counter}", f2, pdist, cdiff, sdiff, s2, w2, h2)) + table.print() - def print_matching(match_list, cmp_image): - if len(match_list) > 0: - match_list.sort(key=lambda i: i.similarity["distance"]) - print_similarity_row(cmp_image, "=", 0) - for i, img in enumerate(match_list): - print_similarity_row(img, ">", i + 1) + def print_similarity_block(rows): + if len(rows) > 0: + for row in rows: + pre, f, w, h, s, pdist, cdiff, sdiff = row + print(f"{pre}|{f}|{pdist}|{cdiff}|{sdiff}|{s}|{w}|{h}") - def print_similarity_row(img, c, index): - fnames = ", ".join([self.db.file2relative(f) for f in img.filename]) - print( - f"{c}{index}|{fnames}|{img.similarity['distance']}|{img.similarity['color']}|{img.similarity['aspect']}|{img.sharpness}|{img.width}|{img.height}" - ) - - print("|".join(("#", "File", "SD", "CD", "RD", "Shp", "W", "H"))) if self.options.similarity: + thr = 20 try: thr = int(self.options.similarity) file = None @@ -463,48 +549,14 @@ class ImageList: file = file[0] if file is None: - # Measure similarity on all files - fingerprint_list = self.db.cursor().execute( - """ - SELECT hash,fingerprint,sharpness,width,height,R,G,B - FROM data - WHERE fingerprint IS NOT NULL - AND sharpness > 0""" - ) - checked = set() - for i, row in enumerate(fingerprint_list): - if row[0] in checked: - continue - cmp_image = set_image(row) - cmp_image.filename = self.db.hash2file(cmp_image.hash) - match_list = get_matching(cmp_image) - for m in match_list: - checked.add(m.hash) - print_matching(match_list, cmp_image) + # Similarity inside the dataset + print_self_similarity(thr) else: # Read single image, and find similarty to that - cmp_image = ImageMeasure(file) - cmp_image.set_all() - cmp_image.filename = [cmp_image.filename] - match_list = get_matching(cmp_image) - print_matching(match_list, cmp_image) + print_visually_similar(file, thr) if self.options.visual_duplicate: - match_list = [] - for row in get_duplicates(): - if row[0] == "0": - if len(match_list) > 0: - print_matching(match_list, cmp_image) - cmp_image = set_image(row[2:]) - cmp_image.filename = [row[1]] - match_list = [] - else: - other_image = set_image(row[2:]) - other_image.filename = [row[1]] - other_image.similarity["color"] = cmp_image.color_difference(other_image) - other_image.similarity["aspect"] = cmp_image.shape_difference(other_image) - match_list.append(other_image) - print_matching(match_list, cmp_image) + print_visual_duplicates() def tag_manage(self): @@ -538,6 +590,30 @@ class ImageList: print(",".join([x[0] for x in tags])) +class Tabulate: + def __init__(self, header): + self.header = header + self.rows = [] + + def append(self, row): + self.rows.append(row) + + def print(self): + + tabulate._table_formats["github"] = tabulate.TableFormat( + lineabove=None, + linebelowheader=None, + linebetweenrows=None, + linebelow=None, + headerrow=tabulate.DataRow("|", "|", "|"), + datarow=tabulate.DataRow("|", "|", "|"), + padding=0, + with_header_hide=["lineabove"], + ) + + print(tabulate.tabulate(self.rows, headers=self.header, tablefmt="github", showindex=False)) + + def clean_dirs(dirs): """Remove in place, because os.walk uses the same variable""" remove = [] @@ -694,7 +770,7 @@ def setup_options(): help="Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity. " + "If value is a filename, search similar to that image. " + "Append with ',value' to limit similarity. default to 20." - + "The output columns: SD SimilarityDiff., CD ColorDiff., " + + "The output columns: PD PerceptualDiff., CD ColorDiff., " + "RD AspectRatioDiff.,Shp SharpnessIndex. This function does not return exact duplicates.", ) diff --git a/py-packages/imagelist2/imagelist2/db.py b/py-packages/imagelist2/imagelist2/db.py index e6f421b..5e740c3 100644 --- a/py-packages/imagelist2/imagelist2/db.py +++ b/py-packages/imagelist2/imagelist2/db.py @@ -1,7 +1,16 @@ import os import sqlite3 +import sys from math import sqrt as sqlite_sqrt +import sqlite_vec + +from .image import ( + calculate_color_difference, + calculate_phash_distance, + calculate_shape_difference, +) + class DB: def __init__(self, sqlfile): @@ -26,6 +35,8 @@ class DB: width INTEGER, height INTEGER, fingerprint TEXT, + p_hash TEXT, + w_hash TEXT, sharpness NUMERIC, R REAL, G REAL, B REAL, BR REAL, BG REAL, BB REAL )""" @@ -48,6 +59,12 @@ class DB: conn = sqlite3.connect(self.sqlfile) conn.text_factory = str conn.create_function("RELATIVE", 1, self.file2relative) + conn.create_function("PDISTANCE", 2, calculate_phash_distance) + conn.create_function("COLORDIFF", 6, calculate_color_difference) + conn.create_function("SHAPEDIFF", 4, calculate_shape_difference) + conn.enable_load_extension(True) + sqlite_vec.load(conn) + conn.enable_load_extension(False) self.conn = conn return conn diff --git a/py-packages/imagelist2/imagelist2/image.py b/py-packages/imagelist2/imagelist2/image.py index deb2cab..76438c6 100644 --- a/py-packages/imagelist2/imagelist2/image.py +++ b/py-packages/imagelist2/imagelist2/image.py @@ -34,6 +34,8 @@ class ImageMeasure: self.height = None self.portrait = None self.fingerprint = None + self.w_hash = None + self.p_hash = None self.sharpness = None self.colors = {x: None for x in ("R", "G", "B", "BR", "BG", "BB")} self.similarity = {"distance": 0, "color": 0, "aspect": 0} @@ -116,7 +118,6 @@ class ImageMeasure: self.description = read_image_comment(self.filename) return self.description - def get_image(self, image_type="numpy"): if self.image is None: @@ -136,11 +137,22 @@ class ImageMeasure: def get_fingerprint(self): if self.fingerprint is None: - # self.fingerprint = str(imagehash.phash(self.get_image("PIL"), hash_size=8)) + self.get_w_hash() + self.get_p_hash() self.fingerprint = str(imagehash.dhash(self.get_image("PIL"), hash_size=8)) return self.fingerprint + def get_w_hash(self): + if self.w_hash is None: + self.w_hash = str(imagehash.whash(self.get_image("PIL"), hash_size=8)) + return self.w_hash + + def get_p_hash(self): + if self.p_hash is None: + self.p_hash = str(imagehash.phash(self.get_image("PIL"), hash_size=8)) + return self.p_hash + def get_sharpness(self): if self.sharpness is None: @@ -180,34 +192,54 @@ class ImageMeasure: return self.colors def similarity_difference(self, other): + try: + other_phash = imagehash.hex_to_hash(other.get_p_hash()) + this_phash = imagehash.hex_to_hash(self.get_p_hash()) - other_phash = imagehash.hex_to_hash(other.get_fingerprint()) - this_phash = imagehash.hex_to_hash(self.get_fingerprint()) - return other_phash - this_phash + return other_phash - this_phash + except Exception as e: + print(e, file=sys.stderr) + return 255 def color_difference(self, other): other_color = other.get_colors() this_color = self.get_colors() - diff = round( - np.sqrt( - np.square(other_color["R"] - this_color["R"]) - + np.square(other_color["G"] - this_color["G"]) - + np.square(other_color["B"] - this_color["B"]) - ), - 1, + + return calculate_color_difference( + this_color["R"], this_color["G"], this_color["B"], other_color["R"], other_color["G"], other_color["B"] ) - return diff def shape_difference(self, other): - - return round(abs(float(other.width) / float(other.height) - float(self.width) / float(self.height)), 4) + return calculate_shape_difference(self.width, self.height, other.width, other.height) EXTENSIONS = (".jpg", ".png", ".tif", ".gif", ".jpeg", ".tiff") JPEG_EXTENSIONS = (".jpg", ".jpeg") +def calculate_color_difference(r1, g1, b1, r2, g2, b2): + diff = round( + np.sqrt(np.square(r1 - r2) + np.square(g1 - g2) + np.square(b1 - b2)), + 1, + ) + return diff + + +def calculate_shape_difference(w1, h1, w2, h2): + return round(abs(float(w1) / float(h1) - float(w2) / float(h2)), 4) + + +def calculate_phash_distance(h1, h2): + try: + return imagehash.hex_to_hash(h1) - imagehash.hex_to_hash(h2) + except Exception as e: + print(e, file=sys.stderr) + print((h1, h2), file=sys.stderr) + + return 255 + + def is_image_extension(f): return os.path.splitext(f.lower())[1] in EXTENSIONS @@ -234,7 +266,8 @@ def read_image_size(fname): im = Image.open(fname) return im.width, im.height + def read_image_comment(fname): """Just reading the comment with PIL""" im = Image.open(fname) - return im.info.get('comment','') + return im.info.get("comment", "") diff --git a/py-packages/imagelist2/pyproject.toml b/py-packages/imagelist2/pyproject.toml index 4399f5b..3768fe0 100644 --- a/py-packages/imagelist2/pyproject.toml +++ b/py-packages/imagelist2/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm"] +dependencies = ["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm", "sqlite-vec", "tabulate"] [project.scripts] image-list = "imagelist2:main" diff --git a/py-packages/imagelist2/setup.py b/py-packages/imagelist2/setup.py index 0ca44fa..aca2a13 100644 --- a/py-packages/imagelist2/setup.py +++ b/py-packages/imagelist2/setup.py @@ -22,5 +22,5 @@ setup( "image-list = imagelist2:main", ] }, - install_requires=["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm"], + install_requires=["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm", "sqlite-vec", "tabulate"], )