switch to using p_hash

This commit is contained in:
q
2025-05-16 23:18:17 +03:00
parent ce19e8a146
commit 86c516742e
5 changed files with 274 additions and 148 deletions

View File

@@ -5,11 +5,12 @@ import traceback
from argparse import ArgumentParser from argparse import ArgumentParser
from datetime import datetime from datetime import datetime
import tabulate
from imagelist2.db import DB, sqlite_sqrt, sqlite_square from imagelist2.db import DB, sqlite_sqrt, sqlite_square
from imagelist2.image import ImageMeasure, is_image_extension from imagelist2.image import ImageMeasure, is_image_extension
from tqdm import tqdm from tqdm import tqdm
__version__ = "0.0.3" __version__ = "0.0.4"
SQLFILE = "image-list.sqlite" SQLFILE = "image-list.sqlite"
# IMGMATCH = re.compile("|".join([".*\." + x + "$" |.*\.jpeg$|.*\.png$|.*\.gif$|.*\.tif$", re.I) # IMGMATCH = re.compile("|".join([".*\." + x + "$" |.*\.jpeg$|.*\.png$|.*\.gif$|.*\.tif$", re.I)
BADDIRS = ["_tn", "_med", ".tn", ".med"] BADDIRS = ["_tn", "_med", ".tn", ".med"]
@@ -22,6 +23,7 @@ class ImageList:
self.options = opts self.options = opts
self.db = DB(self.options.sqlfile) self.db = DB(self.options.sqlfile)
self.root_path = os.path.dirname(os.path.realpath(self.options.sqlfile)) self.root_path = os.path.dirname(os.path.realpath(self.options.sqlfile))
self.similarity_header = ("#", "File", "PD", "CD", "RD", "Shp", "W", "H")
def recursive_add(self): def recursive_add(self):
@@ -126,13 +128,7 @@ class ImageList:
cursor.execute( cursor.execute(
"""INSERT INTO data(hash,portrait,width,height,description) """INSERT INTO data(hash,portrait,width,height,description)
VALUES(?,?,?,?,?)""", VALUES(?,?,?,?,?)""",
( (row[0], image.get_portrait(), image.get_width(), image.get_height(), image.get_description()),
row[0],
image.get_portrait(),
image.get_width(),
image.get_height(),
image.get_description()
),
) )
if i % 50 == 0: if i % 50 == 0:
self.db.conn.commit() self.db.conn.commit()
@@ -194,6 +190,8 @@ class ImageList:
list.file, list.file,
data.hash, data.hash,
data.fingerprint, data.fingerprint,
data.w_hash,
data.p_hash,
data.sharpness, data.sharpness,
data.R, data.R,
data.G, data.G,
@@ -223,21 +221,25 @@ class ImageList:
image = ImageMeasure(filename) image = ImageMeasure(filename)
image.hash = row[1] image.hash = row[1]
image.fingerprint = row[2] image.fingerprint = row[2]
image.sharpness = row[3] image.w_hash = row[3]
image.colors["R"] = row[4] image.p_hash = row[4]
image.colors["G"] = row[5] image.sharpness = row[5]
image.colors["B"] = row[6] image.colors["R"] = row[6]
image.colors["BR"] = row[7] image.colors["G"] = row[7]
image.colors["BG"] = row[8] image.colors["B"] = row[8]
image.colors["BB"] = row[9] image.colors["BR"] = row[9]
image.colors["BG"] = row[10]
image.colors["BB"] = row[11]
# Calculate if required # Calculate if required
image.fingerprint = image.get_fingerprint() image.get_fingerprint()
image.sharpness = image.get_sharpness() image.sharpness = image.get_sharpness()
image.colors.update(image.get_colors()) image.colors.update(image.get_colors())
cursor.execute( cursor.execute(
"""UPDATE data SET """UPDATE data SET
fingerprint = ?, fingerprint = ?,
w_hash = ?,
p_hash = ?,
sharpness = ?, sharpness = ?,
R = ?, R = ?,
G = ?, G = ?,
@@ -249,6 +251,8 @@ class ImageList:
""", """,
( (
image.fingerprint, image.fingerprint,
image.w_hash,
image.p_hash,
image.sharpness, image.sharpness,
image.colors["R"], image.colors["R"],
image.colors["G"], image.colors["G"],
@@ -291,8 +295,10 @@ class ImageList:
sizes.append(row[0]) sizes.append(row[0])
else: else:
sizes[entries.index(start_path)] += row[0] sizes[entries.index(start_path)] += row[0]
table = Tabulate(("Size[b]", "Size", "Path"))
for entry in zip(sizes, entries): for entry in zip(sizes, entries):
print("| ".join([str(entry[0]).ljust(14), humanize_size(entry[0]).rjust(8), entry[1]])) table.append((entry[0], humanize_size(entry[0]), entry[1]))
table.print()
def duplicates(self): def duplicates(self):
result = self.db.cursor().execute( result = self.db.cursor().execute(
@@ -311,9 +317,11 @@ class ImageList:
FROM f FROM f
""", """,
) )
table = Tabulate(["#", "File"])
for row in result: for row in result:
c = "=" if row[0] == "0" else ">" c = "==" if row[0] == "0" else f">{row[0]}"
print(c + "|".join(row)) table.append([c, row[1]])
table.print()
def nearestcolor(self): def nearestcolor(self):
"""Find closest matching images to given RGB color""" """Find closest matching images to given RGB color"""
@@ -356,21 +364,19 @@ class ImageList:
""", """,
(src[0], src[1], src[2], src[3], f), (src[0], src[1], src[2], src[3], f),
) )
table = Tabulate(("Path", "Dist", "BR", "BG", "BB"))
print("|".join(("Path", "Dist", "BR", "BG", "BB")))
for hit in result: for hit in result:
p, d, r, g, b = hit p, d, r, g, b = hit
print( table.append(
"|".join( (
( p,
p, str(d),
str(d), str(int(r)),
str(int(r)), str(int(g)),
str(int(g)), str(int(b)),
str(int(b)),
)
) )
) )
table.print()
def similarity(self): def similarity(self):
@@ -378,79 +384,159 @@ class ImageList:
image = ImageMeasure(None) image = ImageMeasure(None)
image.hash = row[0] image.hash = row[0]
image.fingerprint = row[1] image.fingerprint = row[1]
image.sharpness = row[2] image.w_hash = row[2]
image.width = row[3] image.p_hash = row[3]
image.height = row[4] image.sharpness = row[4]
image.colors["R"] = row[5] image.width = row[5]
image.colors["G"] = row[6] image.height = row[6]
image.colors["B"] = row[7] image.colors["R"] = row[7]
image.colors["G"] = row[8]
image.colors["B"] = row[9]
return image return image
def get_matching(cmp_image): def print_visually_similar(file, thr):
cmp_image = ImageMeasure(file)
cmp_image.set_all()
cmp_image.filename = cmp_image.filename
compare_list = self.db.cursor().execute( compare_list = self.db.cursor().execute(
"""SELECT hash,fingerprint,sharpness,width,height,R,G,B """SELECT
FROM data RELATIVE(file),width,height,sharpness,
WHERE fingerprint IS NOT NULL AND sharpness > 0 AND hash != ?""", PDISTANCE(p_hash, ?) AS p_dist,
(cmp_image.hash,), COLORDIFF(R,G,B,?,?,?) AS c_diff,
SHAPEDIFF(width,height,?,?) AS s_diff
FROM files
WHERE p_hash IS NOT NULL AND
sharpness > 0 AND
hash != ? AND
p_dist <= ?
ORDER BY p_dist, file""",
(
cmp_image.p_hash,
cmp_image.colors["R"],
cmp_image.colors["G"],
cmp_image.colors["B"],
cmp_image.width,
cmp_image.height,
cmp_image.hash,
thr,
),
) )
match_list = [] table = Tabulate(self.similarity_header)
for row2 in compare_list: table.append(
other_image = set_image(row2) (
similarity = cmp_image.similarity_difference(other_image) "==",
if similarity <= thr: self.db.file2relative(cmp_image.filename),
other_image.similarity["distance"] = similarity 0,
other_image.similarity["color"] = cmp_image.color_difference(other_image) 0,
other_image.similarity["aspect"] = cmp_image.shape_difference(other_image) 0,
other_image.filename = self.db.hash2file(other_image.hash) cmp_image.sharpness,
match_list.append(other_image) cmp_image.width,
return match_list cmp_image.height,
)
)
for counter, row in enumerate(compare_list):
f2, w2, h2, s2, pdist, cdiff, sdiff = row
table.append((f">{counter+1}", f2, pdist, cdiff, sdiff, s2, w2, h2))
table.print()
def get_visual_duplicates():
def get_duplicates():
return self.db.cursor().execute( return self.db.cursor().execute(
""" """
WITH WITH
duplicates AS (SELECT fingerprint FROM data GROUP BY fingerprint HAVING count(fingerprint) > 1 AND sharpness > 0), duplicates AS (SELECT p_hash FROM data GROUP BY p_hash HAVING count(p_hash) > 1)
duphash AS ( SELECT
SELECT duplicates.fingerprint, data.hash, data.sharpness, data.width, data.height, data.R, data.G, data.B RELATIVE(files.file) AS file,
FROM duplicates files.width,
LEFT JOIN data ON (duplicates.fingerprint = data.fingerprint) files.height,
), files.sharpness,
f AS (SELECT files.R,
duphash.fingerprint, duphash.hash,list.file, files.G,
duphash.sharpness, files.B,
duphash.width, duphash.height, files.p_hash
duphash.R, duphash.G, duphash.B FROM files
FROM duphash WHERE p_hash IN ( SELECT p_hash FROM duplicates )
LEFT JOIN list ON (list.hash = duphash.hash) ORDER BY p_hash, files.size DESC
WHERE list.file IS NOT NULL """
ORDER BY list.file )
def print_visual_duplicates():
    """Print every group of files that share the same p_hash as one table.

    Rows come from get_visual_duplicates(). The first row of each p_hash
    group is printed as the "==" reference row; each following row of the
    group is printed as ">N" with its color and shape difference measured
    against that reference. The perceptual-distance column is always 0,
    because rows of one group have equal p_hash values.
    """

    def measured(path, width, height, sharpness, red, green, blue):
        # Build an ImageMeasure and fill it with the values already read
        # from the database row.
        img = ImageMeasure(path)
        img.width = width
        img.height = height
        img.sharpness = sharpness
        img.colors.update({"B": blue, "G": green, "R": red})
        return img

    table = Tabulate(self.similarity_header)
    current_hash = None
    position = 0
    for f, w, h, s, r, g, b, p_hash in get_visual_duplicates():
        if current_hash != p_hash:
            # A new p_hash value starts a new group: emit the reference row
            # and remember it for the comparisons that follow.
            current_hash = p_hash
            position = 0
            table.append(("==", f, 0, 0, 0, s, w, h))
            reference = measured(f, w, h, s, r, g, b)
        else:
            position += 1
            candidate = measured(f, w, h, s, r, g, b)
            cdiff = reference.color_difference(candidate)
            sdiff = reference.shape_difference(candidate)
            table.append((f">{position}", f, 0, cdiff, sdiff, s, w, h))
    table.print()
def print_self_similarity(thr):
fingerprint_list = self.db.cursor().execute(
"""
WITH disttab AS (
WITH
t1 AS ( SELECT * FROM files ),
t2 AS ( SELECT * FROM files )
SELECT
RELATIVE(t1.file) AS file1,
t1.width AS width1,
t1.height AS height1,
t1.sharpness AS sharpness1,
RELATIVE(t2.file) AS file2,
t2.width AS width2,
t2.height AS height2,
t2.sharpness AS sharpness2,
PDISTANCE(t1.p_hash,t2.p_hash) AS p_distance,
COLORDIFF(t1.R,t1.G,t1.B,t2.R,t2.G,t2.B) AS c_diff,
SHAPEDIFF(t1.width,t1.height,t2.width,t2.height) AS s_diff
FROM t1 INNER JOIN t2
ON t1.file < t2.file
WHERE p_distance <= ?
ORDER BY t1.file, p_distance, t2.file
) )
SELECT SELECT * FROM disttab
CAST((row_number() OVER (PARTITION BY f.fingerprint))-1 AS TEXT) AS row, """,
file, (thr,),
hash,
fingerprint,
sharpness,width,height,R,G,B
FROM f
""",
) )
f1block = None
counter = 0
table = Tabulate(self.similarity_header)
for row in fingerprint_list:
f1, w1, h1, s1, f2, w2, h2, s2, pdist, cdiff, sdiff = row
if f1block != f1:
f1block = f1
counter = 0
table.append(("==", f1, 0, 0, 0, s1, w1, h1))
counter += 1
table.append((f">{counter}", f2, pdist, cdiff, sdiff, s2, w2, h2))
table.print()
def print_matching(match_list, cmp_image): def print_similarity_block(rows):
if len(match_list) > 0: if len(rows) > 0:
match_list.sort(key=lambda i: i.similarity["distance"]) for row in rows:
print_similarity_row(cmp_image, "=", 0) pre, f, w, h, s, pdist, cdiff, sdiff = row
for i, img in enumerate(match_list): print(f"{pre}|{f}|{pdist}|{cdiff}|{sdiff}|{s}|{w}|{h}")
print_similarity_row(img, ">", i + 1)
def print_similarity_row(img, c, index):
fnames = ", ".join([self.db.file2relative(f) for f in img.filename])
print(
f"{c}{index}|{fnames}|{img.similarity['distance']}|{img.similarity['color']}|{img.similarity['aspect']}|{img.sharpness}|{img.width}|{img.height}"
)
print("|".join(("#", "File", "SD", "CD", "RD", "Shp", "W", "H")))
if self.options.similarity: if self.options.similarity:
thr = 20
try: try:
thr = int(self.options.similarity) thr = int(self.options.similarity)
file = None file = None
@@ -463,48 +549,14 @@ class ImageList:
file = file[0] file = file[0]
if file is None: if file is None:
# Measure similarity on all files # Similarity inside the dataset
fingerprint_list = self.db.cursor().execute( print_self_similarity(thr)
"""
SELECT hash,fingerprint,sharpness,width,height,R,G,B
FROM data
WHERE fingerprint IS NOT NULL
AND sharpness > 0"""
)
checked = set()
for i, row in enumerate(fingerprint_list):
if row[0] in checked:
continue
cmp_image = set_image(row)
cmp_image.filename = self.db.hash2file(cmp_image.hash)
match_list = get_matching(cmp_image)
for m in match_list:
checked.add(m.hash)
print_matching(match_list, cmp_image)
else: else:
# Read single image, and find similarty to that # Read single image, and find similarty to that
cmp_image = ImageMeasure(file) print_visually_similar(file, thr)
cmp_image.set_all()
cmp_image.filename = [cmp_image.filename]
match_list = get_matching(cmp_image)
print_matching(match_list, cmp_image)
if self.options.visual_duplicate: if self.options.visual_duplicate:
match_list = [] print_visual_duplicates()
for row in get_duplicates():
if row[0] == "0":
if len(match_list) > 0:
print_matching(match_list, cmp_image)
cmp_image = set_image(row[2:])
cmp_image.filename = [row[1]]
match_list = []
else:
other_image = set_image(row[2:])
other_image.filename = [row[1]]
other_image.similarity["color"] = cmp_image.color_difference(other_image)
other_image.similarity["aspect"] = cmp_image.shape_difference(other_image)
match_list.append(other_image)
print_matching(match_list, cmp_image)
def tag_manage(self): def tag_manage(self):
@@ -538,6 +590,30 @@ class ImageList:
print(",".join([x[0] for x in tags])) print(",".join([x[0] for x in tags]))
class Tabulate:
    """Collect rows under a fixed header and print them as a pipe-separated table.

    NOTE: printing replaces the "github" entry of tabulate's private format
    registry (``tabulate._table_formats``) with a compact variant that has no
    ruler lines and no cell padding. The registry is module-global, so the
    override is visible to anything else that uses that format name.
    """

    def __init__(self, header):
        """Remember the column titles and start with an empty row list."""
        self.header = header
        self.rows = []

    def append(self, row):
        """Queue one row (a sequence of cell values) for printing."""
        self.rows.append(row)

    def print(self):
        """Render the header and all queued rows to stdout."""
        # Header and data rows use the same "|" begin/separator/end markers.
        pipe_row = tabulate.DataRow("|", "|", "|")
        compact_format = tabulate.TableFormat(
            lineabove=None,
            linebelowheader=None,
            linebetweenrows=None,
            linebelow=None,
            headerrow=pipe_row,
            datarow=pipe_row,
            padding=0,
            with_header_hide=["lineabove"],
        )
        tabulate._table_formats["github"] = compact_format
        rendered = tabulate.tabulate(self.rows, headers=self.header, tablefmt="github", showindex=False)
        print(rendered)
def clean_dirs(dirs): def clean_dirs(dirs):
"""Remove in place, because os.walk uses the same variable""" """Remove in place, because os.walk uses the same variable"""
remove = [] remove = []
@@ -694,7 +770,7 @@ def setup_options():
help="Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity. " help="Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity. "
+ "If value is a filename, search similar to that image. " + "If value is a filename, search similar to that image. "
+ "Append with ',value' to limit similarity. default to 20." + "Append with ',value' to limit similarity. default to 20."
+ "The output columns: SD SimilarityDiff., CD ColorDiff., " + "The output columns: PD PerceptualDiff., CD ColorDiff., "
+ "RD AspectRatioDiff.,Shp SharpnessIndex. This function does not return exact duplicates.", + "RD AspectRatioDiff.,Shp SharpnessIndex. This function does not return exact duplicates.",
) )

View File

@@ -1,7 +1,16 @@
import os import os
import sqlite3 import sqlite3
import sys
from math import sqrt as sqlite_sqrt from math import sqrt as sqlite_sqrt
import sqlite_vec
from .image import (
calculate_color_difference,
calculate_phash_distance,
calculate_shape_difference,
)
class DB: class DB:
def __init__(self, sqlfile): def __init__(self, sqlfile):
@@ -26,6 +35,8 @@ class DB:
width INTEGER, width INTEGER,
height INTEGER, height INTEGER,
fingerprint TEXT, fingerprint TEXT,
p_hash TEXT,
w_hash TEXT,
sharpness NUMERIC, sharpness NUMERIC,
R REAL, G REAL, B REAL, BR REAL, BG REAL, BB REAL R REAL, G REAL, B REAL, BR REAL, BG REAL, BB REAL
)""" )"""
@@ -48,6 +59,12 @@ class DB:
conn = sqlite3.connect(self.sqlfile) conn = sqlite3.connect(self.sqlfile)
conn.text_factory = str conn.text_factory = str
conn.create_function("RELATIVE", 1, self.file2relative) conn.create_function("RELATIVE", 1, self.file2relative)
conn.create_function("PDISTANCE", 2, calculate_phash_distance)
conn.create_function("COLORDIFF", 6, calculate_color_difference)
conn.create_function("SHAPEDIFF", 4, calculate_shape_difference)
conn.enable_load_extension(True)
sqlite_vec.load(conn)
conn.enable_load_extension(False)
self.conn = conn self.conn = conn
return conn return conn

View File

@@ -34,6 +34,8 @@ class ImageMeasure:
self.height = None self.height = None
self.portrait = None self.portrait = None
self.fingerprint = None self.fingerprint = None
self.w_hash = None
self.p_hash = None
self.sharpness = None self.sharpness = None
self.colors = {x: None for x in ("R", "G", "B", "BR", "BG", "BB")} self.colors = {x: None for x in ("R", "G", "B", "BR", "BG", "BB")}
self.similarity = {"distance": 0, "color": 0, "aspect": 0} self.similarity = {"distance": 0, "color": 0, "aspect": 0}
@@ -116,7 +118,6 @@ class ImageMeasure:
self.description = read_image_comment(self.filename) self.description = read_image_comment(self.filename)
return self.description return self.description
def get_image(self, image_type="numpy"): def get_image(self, image_type="numpy"):
if self.image is None: if self.image is None:
@@ -136,11 +137,22 @@ class ImageMeasure:
def get_fingerprint(self): def get_fingerprint(self):
if self.fingerprint is None: if self.fingerprint is None:
# self.fingerprint = str(imagehash.phash(self.get_image("PIL"), hash_size=8)) self.get_w_hash()
self.get_p_hash()
self.fingerprint = str(imagehash.dhash(self.get_image("PIL"), hash_size=8)) self.fingerprint = str(imagehash.dhash(self.get_image("PIL"), hash_size=8))
return self.fingerprint return self.fingerprint
def get_w_hash(self):
if self.w_hash is None:
self.w_hash = str(imagehash.whash(self.get_image("PIL"), hash_size=8))
return self.w_hash
def get_p_hash(self):
if self.p_hash is None:
self.p_hash = str(imagehash.phash(self.get_image("PIL"), hash_size=8))
return self.p_hash
def get_sharpness(self): def get_sharpness(self):
if self.sharpness is None: if self.sharpness is None:
@@ -180,34 +192,54 @@ class ImageMeasure:
return self.colors return self.colors
def similarity_difference(self, other): def similarity_difference(self, other):
try:
other_phash = imagehash.hex_to_hash(other.get_p_hash())
this_phash = imagehash.hex_to_hash(self.get_p_hash())
other_phash = imagehash.hex_to_hash(other.get_fingerprint()) return other_phash - this_phash
this_phash = imagehash.hex_to_hash(self.get_fingerprint()) except Exception as e:
return other_phash - this_phash print(e, file=sys.stderr)
return 255
def color_difference(self, other): def color_difference(self, other):
other_color = other.get_colors() other_color = other.get_colors()
this_color = self.get_colors() this_color = self.get_colors()
diff = round(
np.sqrt( return calculate_color_difference(
np.square(other_color["R"] - this_color["R"]) this_color["R"], this_color["G"], this_color["B"], other_color["R"], other_color["G"], other_color["B"]
+ np.square(other_color["G"] - this_color["G"])
+ np.square(other_color["B"] - this_color["B"])
),
1,
) )
return diff
def shape_difference(self, other): def shape_difference(self, other):
return calculate_shape_difference(self.width, self.height, other.width, other.height)
return round(abs(float(other.width) / float(other.height) - float(self.width) / float(self.height)), 4)
EXTENSIONS = (".jpg", ".png", ".tif", ".gif", ".jpeg", ".tiff") EXTENSIONS = (".jpg", ".png", ".tif", ".gif", ".jpeg", ".tiff")
JPEG_EXTENSIONS = (".jpg", ".jpeg") JPEG_EXTENSIONS = (".jpg", ".jpeg")
def calculate_color_difference(r1, g1, b1, r2, g2, b2):
    """Return the Euclidean distance between two RGB colors, rounded to 1 decimal.

    The function is also registered as the SQLite user function COLORDIFF,
    where it is evaluated on rows whose color columns may be NULL (arriving
    here as None). A None operand used to raise TypeError and abort the
    whole query; the function now propagates the missing value instead.

    Parameters:
        r1, g1, b1: channels of the first color.
        r2, g2, b2: channels of the second color.

    Returns:
        The rounded distance, or None when any channel is None.
    """
    if any(channel is None for channel in (r1, g1, b1, r2, g2, b2)):
        # Mirror SQL NULL semantics: unknown input gives unknown output.
        return None
    diff = round(
        np.sqrt(np.square(r1 - r2) + np.square(g1 - g2) + np.square(b1 - b2)),
        1,
    )
    return diff
def calculate_shape_difference(w1, h1, w2, h2):
    """Return the absolute difference of two width/height ratios, rounded to 4 decimals.

    Parameters:
        w1, h1: width and height of the first image.
        w2, h2: width and height of the second image.
    """
    first_ratio = float(w1) / float(h1)
    second_ratio = float(w2) / float(h2)
    return round(abs(first_ratio - second_ratio), 4)
def calculate_phash_distance(h1, h2):
    """Return the difference between two hashes given as hex strings.

    Both arguments are decoded with ``imagehash.hex_to_hash`` and subtracted
    with the ``-`` operator of the resulting hash objects. If anything in
    that process raises, the exception and the offending pair are written to
    stderr and 255 is returned instead.
    """
    # NOTE(review): relies on `sys` being imported at module level; that
    # import is not visible in this part of the file — confirm.
    try:
        first = imagehash.hex_to_hash(h1)
        second = imagehash.hex_to_hash(h2)
        return first - second
    except Exception as err:
        for message in (err, (h1, h2)):
            print(message, file=sys.stderr)
        return 255
def is_image_extension(f): def is_image_extension(f):
return os.path.splitext(f.lower())[1] in EXTENSIONS return os.path.splitext(f.lower())[1] in EXTENSIONS
@@ -234,7 +266,8 @@ def read_image_size(fname):
im = Image.open(fname) im = Image.open(fname)
return im.width, im.height return im.width, im.height
def read_image_comment(fname): def read_image_comment(fname):
"""Just reading the comment with PIL""" """Just reading the comment with PIL"""
im = Image.open(fname) im = Image.open(fname)
return im.info.get('comment','') return im.info.get("comment", "")

View File

@@ -23,7 +23,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy", "Programming Language :: Python :: Implementation :: PyPy",
] ]
dependencies = ["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm"] dependencies = ["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm", "sqlite-vec", "tabulate"]
[project.scripts] [project.scripts]
image-list = "imagelist2:main" image-list = "imagelist2:main"

View File

@@ -22,5 +22,5 @@ setup(
"image-list = imagelist2:main", "image-list = imagelist2:main",
] ]
}, },
install_requires=["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm"], install_requires=["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm", "sqlite-vec", "tabulate"],
) )