switch to using p_hash

This commit is contained in:
q
2025-05-16 23:18:17 +03:00
parent ce19e8a146
commit 86c516742e
5 changed files with 274 additions and 148 deletions

View File

@@ -5,11 +5,12 @@ import traceback
from argparse import ArgumentParser
from datetime import datetime
import tabulate
from imagelist2.db import DB, sqlite_sqrt, sqlite_square
from imagelist2.image import ImageMeasure, is_image_extension
from tqdm import tqdm
__version__ = "0.0.3"
__version__ = "0.0.4"
SQLFILE = "image-list.sqlite"
# IMGMATCH = re.compile("|".join([".*\." + x + "$" |.*\.jpeg$|.*\.png$|.*\.gif$|.*\.tif$", re.I)
BADDIRS = ["_tn", "_med", ".tn", ".med"]
@@ -22,6 +23,7 @@ class ImageList:
self.options = opts
self.db = DB(self.options.sqlfile)
self.root_path = os.path.dirname(os.path.realpath(self.options.sqlfile))
self.similarity_header = ("#", "File", "PD", "CD", "RD", "Shp", "W", "H")
def recursive_add(self):
@@ -126,13 +128,7 @@ class ImageList:
cursor.execute(
"""INSERT INTO data(hash,portrait,width,height,description)
VALUES(?,?,?,?,?)""",
(
row[0],
image.get_portrait(),
image.get_width(),
image.get_height(),
image.get_description()
),
(row[0], image.get_portrait(), image.get_width(), image.get_height(), image.get_description()),
)
if i % 50 == 0:
self.db.conn.commit()
@@ -194,6 +190,8 @@ class ImageList:
list.file,
data.hash,
data.fingerprint,
data.w_hash,
data.p_hash,
data.sharpness,
data.R,
data.G,
@@ -223,21 +221,25 @@ class ImageList:
image = ImageMeasure(filename)
image.hash = row[1]
image.fingerprint = row[2]
image.sharpness = row[3]
image.colors["R"] = row[4]
image.colors["G"] = row[5]
image.colors["B"] = row[6]
image.colors["BR"] = row[7]
image.colors["BG"] = row[8]
image.colors["BB"] = row[9]
image.w_hash = row[3]
image.p_hash = row[4]
image.sharpness = row[5]
image.colors["R"] = row[6]
image.colors["G"] = row[7]
image.colors["B"] = row[8]
image.colors["BR"] = row[9]
image.colors["BG"] = row[10]
image.colors["BB"] = row[11]
# Calculate if required
image.fingerprint = image.get_fingerprint()
image.get_fingerprint()
image.sharpness = image.get_sharpness()
image.colors.update(image.get_colors())
cursor.execute(
"""UPDATE data SET
fingerprint = ?,
w_hash = ?,
p_hash = ?,
sharpness = ?,
R = ?,
G = ?,
@@ -249,6 +251,8 @@ class ImageList:
""",
(
image.fingerprint,
image.w_hash,
image.p_hash,
image.sharpness,
image.colors["R"],
image.colors["G"],
@@ -291,8 +295,10 @@ class ImageList:
sizes.append(row[0])
else:
sizes[entries.index(start_path)] += row[0]
table = Tabulate(("Size[b]", "Size", "Path"))
for entry in zip(sizes, entries):
print("| ".join([str(entry[0]).ljust(14), humanize_size(entry[0]).rjust(8), entry[1]]))
table.append((entry[0], humanize_size(entry[0]), entry[1]))
table.print()
def duplicates(self):
result = self.db.cursor().execute(
@@ -311,9 +317,11 @@ class ImageList:
FROM f
""",
)
table = Tabulate(["#", "File"])
for row in result:
c = "=" if row[0] == "0" else ">"
print(c + "|".join(row))
c = "==" if row[0] == "0" else f">{row[0]}"
table.append([c, row[1]])
table.print()
def nearestcolor(self):
"""Find closest matching images to given RGB color"""
@@ -356,21 +364,19 @@ class ImageList:
""",
(src[0], src[1], src[2], src[3], f),
)
print("|".join(("Path", "Dist", "BR", "BG", "BB")))
table = Tabulate(("Path", "Dist", "BR", "BG", "BB"))
for hit in result:
p, d, r, g, b = hit
print(
"|".join(
(
p,
str(d),
str(int(r)),
str(int(g)),
str(int(b)),
)
table.append(
(
p,
str(d),
str(int(r)),
str(int(g)),
str(int(b)),
)
)
table.print()
def similarity(self):
@@ -378,79 +384,159 @@ class ImageList:
image = ImageMeasure(None)
image.hash = row[0]
image.fingerprint = row[1]
image.sharpness = row[2]
image.width = row[3]
image.height = row[4]
image.colors["R"] = row[5]
image.colors["G"] = row[6]
image.colors["B"] = row[7]
image.w_hash = row[2]
image.p_hash = row[3]
image.sharpness = row[4]
image.width = row[5]
image.height = row[6]
image.colors["R"] = row[7]
image.colors["G"] = row[8]
image.colors["B"] = row[9]
return image
def get_matching(cmp_image):
def print_visually_similar(file, thr):
cmp_image = ImageMeasure(file)
cmp_image.set_all()
cmp_image.filename = cmp_image.filename
compare_list = self.db.cursor().execute(
"""SELECT hash,fingerprint,sharpness,width,height,R,G,B
FROM data
WHERE fingerprint IS NOT NULL AND sharpness > 0 AND hash != ?""",
(cmp_image.hash,),
"""SELECT
RELATIVE(file),width,height,sharpness,
PDISTANCE(p_hash, ?) AS p_dist,
COLORDIFF(R,G,B,?,?,?) AS c_diff,
SHAPEDIFF(width,height,?,?) AS s_diff
FROM files
WHERE p_hash IS NOT NULL AND
sharpness > 0 AND
hash != ? AND
p_dist <= ?
ORDER BY p_dist, file""",
(
cmp_image.p_hash,
cmp_image.colors["R"],
cmp_image.colors["G"],
cmp_image.colors["B"],
cmp_image.width,
cmp_image.height,
cmp_image.hash,
thr,
),
)
match_list = []
for row2 in compare_list:
other_image = set_image(row2)
similarity = cmp_image.similarity_difference(other_image)
if similarity <= thr:
other_image.similarity["distance"] = similarity
other_image.similarity["color"] = cmp_image.color_difference(other_image)
other_image.similarity["aspect"] = cmp_image.shape_difference(other_image)
other_image.filename = self.db.hash2file(other_image.hash)
match_list.append(other_image)
return match_list
table = Tabulate(self.similarity_header)
table.append(
(
"==",
self.db.file2relative(cmp_image.filename),
0,
0,
0,
cmp_image.sharpness,
cmp_image.width,
cmp_image.height,
)
)
for counter, row in enumerate(compare_list):
f2, w2, h2, s2, pdist, cdiff, sdiff = row
table.append((f">{counter+1}", f2, pdist, cdiff, sdiff, s2, w2, h2))
table.print()
def get_visual_duplicates():
def get_duplicates():
return self.db.cursor().execute(
"""
WITH
duplicates AS (SELECT fingerprint FROM data GROUP BY fingerprint HAVING count(fingerprint) > 1 AND sharpness > 0),
duphash AS (
SELECT duplicates.fingerprint, data.hash, data.sharpness, data.width, data.height, data.R, data.G, data.B
FROM duplicates
LEFT JOIN data ON (duplicates.fingerprint = data.fingerprint)
),
f AS (SELECT
duphash.fingerprint, duphash.hash,list.file,
duphash.sharpness,
duphash.width, duphash.height,
duphash.R, duphash.G, duphash.B
FROM duphash
LEFT JOIN list ON (list.hash = duphash.hash)
WHERE list.file IS NOT NULL
ORDER BY list.file
WITH
duplicates AS (SELECT p_hash FROM data GROUP BY p_hash HAVING count(p_hash) > 1)
SELECT
RELATIVE(files.file) AS file,
files.width,
files.height,
files.sharpness,
files.R,
files.G,
files.B,
files.p_hash
FROM files
WHERE p_hash IN ( SELECT p_hash FROM duplicates )
ORDER BY p_hash, files.size DESC
"""
)
def print_visual_duplicates():
    """Print a table of files grouped by identical p_hash.

    Rows are read from get_visual_duplicates() and are expected to arrive
    grouped by p_hash. The first file of each group is printed as the "=="
    reference row; every following file of the same group is printed as
    ">N" together with its color and shape difference to that reference.
    The perceptual distance column is always 0 inside a group.
    """

    def measured(path, width, height, sharpness, red, green, blue):
        # Build an ImageMeasure from stored values instead of reading the file.
        img = ImageMeasure(path)
        img.width = width
        img.height = height
        img.sharpness = sharpness
        img.colors.update({"B": blue, "G": green, "R": red})
        return img

    current_hash = None
    position = 0
    table = Tabulate(self.similarity_header)
    for f, w, h, s, r, g, b, p_hash in get_visual_duplicates():
        if current_hash != p_hash:
            # A new p_hash value starts a new group; this file is its reference.
            current_hash = p_hash
            position = 0
            table.append(("==", f, 0, 0, 0, s, w, h))
            reference = measured(f, w, h, s, r, g, b)
        else:
            position += 1
            candidate = measured(f, w, h, s, r, g, b)
            color_delta = reference.color_difference(candidate)
            shape_delta = reference.shape_difference(candidate)
            table.append((f">{position}", f, 0, color_delta, shape_delta, s, w, h))
    table.print()
def print_self_similarity(thr):
fingerprint_list = self.db.cursor().execute(
"""
WITH disttab AS (
WITH
t1 AS ( SELECT * FROM files ),
t2 AS ( SELECT * FROM files )
SELECT
RELATIVE(t1.file) AS file1,
t1.width AS width1,
t1.height AS height1,
t1.sharpness AS sharpness1,
RELATIVE(t2.file) AS file2,
t2.width AS width2,
t2.height AS height2,
t2.sharpness AS sharpness2,
PDISTANCE(t1.p_hash,t2.p_hash) AS p_distance,
COLORDIFF(t1.R,t1.G,t1.B,t2.R,t2.G,t2.B) AS c_diff,
SHAPEDIFF(t1.width,t1.height,t2.width,t2.height) AS s_diff
FROM t1 INNER JOIN t2
ON t1.file < t2.file
WHERE p_distance <= ?
ORDER BY t1.file, p_distance, t2.file
)
SELECT
CAST((row_number() OVER (PARTITION BY f.fingerprint))-1 AS TEXT) AS row,
file,
hash,
fingerprint,
sharpness,width,height,R,G,B
FROM f
""",
SELECT * FROM disttab
""",
(thr,),
)
f1block = None
counter = 0
table = Tabulate(self.similarity_header)
for row in fingerprint_list:
f1, w1, h1, s1, f2, w2, h2, s2, pdist, cdiff, sdiff = row
if f1block != f1:
f1block = f1
counter = 0
table.append(("==", f1, 0, 0, 0, s1, w1, h1))
counter += 1
table.append((f">{counter}", f2, pdist, cdiff, sdiff, s2, w2, h2))
table.print()
def print_matching(match_list, cmp_image):
    """Print the reference image followed by its matches, nearest first.

    Nothing is printed when match_list is empty. Otherwise match_list is
    sorted in place by each entry's similarity["distance"], the reference
    image is printed with marker "=" and index 0, and the matches follow
    with marker ">" and indexes starting at 1.
    """
    if len(match_list) == 0:
        return
    match_list.sort(key=lambda candidate: candidate.similarity["distance"])
    print_similarity_row(cmp_image, "=", 0)
    for rank, candidate in enumerate(match_list, start=1):
        print_similarity_row(candidate, ">", rank)
def print_similarity_block(rows):
    """Print similarity rows as pipe-separated lines.

    Args:
        rows: iterable of 8-item sequences in the order
            (prefix, file, width, height, sharpness, p_dist, c_diff, s_diff).

    Each row is written to stdout as
    ``prefix|file|p_dist|c_diff|s_diff|sharpness|width|height``.
    An empty iterable prints nothing.
    """
    # Iterating directly makes the former len() guard unnecessary and lets
    # callers pass generators or cursors as well as lists.
    for pre, f, w, h, s, pdist, cdiff, sdiff in rows:
        print(f"{pre}|{f}|{pdist}|{cdiff}|{sdiff}|{s}|{w}|{h}")
def print_similarity_row(img, c, index):
    """Print one pipe-separated similarity line for img.

    img.filename is iterated as a collection of paths; each path is passed
    through self.db.file2relative() and the results are joined with ", ".
    c is the row marker and index the row number printed right after it.
    """
    relative_names = (self.db.file2relative(path) for path in img.filename)
    fnames = ", ".join(relative_names)
    line = f"{c}{index}|{fnames}|{img.similarity['distance']}|{img.similarity['color']}|{img.similarity['aspect']}|{img.sharpness}|{img.width}|{img.height}"
    print(line)
print("|".join(("#", "File", "SD", "CD", "RD", "Shp", "W", "H")))
if self.options.similarity:
thr = 20
try:
thr = int(self.options.similarity)
file = None
@@ -463,48 +549,14 @@ class ImageList:
file = file[0]
if file is None:
# Measure similarity on all files
fingerprint_list = self.db.cursor().execute(
"""
SELECT hash,fingerprint,sharpness,width,height,R,G,B
FROM data
WHERE fingerprint IS NOT NULL
AND sharpness > 0"""
)
checked = set()
for i, row in enumerate(fingerprint_list):
if row[0] in checked:
continue
cmp_image = set_image(row)
cmp_image.filename = self.db.hash2file(cmp_image.hash)
match_list = get_matching(cmp_image)
for m in match_list:
checked.add(m.hash)
print_matching(match_list, cmp_image)
# Similarity inside the dataset
print_self_similarity(thr)
else:
# Read a single image and find images similar to it
cmp_image = ImageMeasure(file)
cmp_image.set_all()
cmp_image.filename = [cmp_image.filename]
match_list = get_matching(cmp_image)
print_matching(match_list, cmp_image)
print_visually_similar(file, thr)
if self.options.visual_duplicate:
match_list = []
for row in get_duplicates():
if row[0] == "0":
if len(match_list) > 0:
print_matching(match_list, cmp_image)
cmp_image = set_image(row[2:])
cmp_image.filename = [row[1]]
match_list = []
else:
other_image = set_image(row[2:])
other_image.filename = [row[1]]
other_image.similarity["color"] = cmp_image.color_difference(other_image)
other_image.similarity["aspect"] = cmp_image.shape_difference(other_image)
match_list.append(other_image)
print_matching(match_list, cmp_image)
print_visual_duplicates()
def tag_manage(self):
@@ -538,6 +590,30 @@ class ImageList:
print(",".join([x[0] for x in tags]))
class Tabulate:
    """Collect rows and print them as a compact pipe-delimited table.

    The output has a header row and data rows, each cell separated by "|",
    with no padding and no horizontal rule lines.
    """

    def __init__(self, header):
        """Store the column titles and start with an empty row list."""
        self.header = header
        self.rows = []

    def append(self, row):
        """Add one row (a sequence of cell values) to the table."""
        self.rows.append(row)

    def print(self):
        """Render all collected rows with tabulate and write them to stdout."""
        # Pass the format object straight to tabulate() rather than
        # overwriting the library's private _table_formats["github"] entry,
        # which would change the "github" format for the whole process.
        compact_format = tabulate.TableFormat(
            lineabove=None,
            linebelowheader=None,
            linebetweenrows=None,
            linebelow=None,
            headerrow=tabulate.DataRow("|", "|", "|"),
            datarow=tabulate.DataRow("|", "|", "|"),
            padding=0,
            with_header_hide=["lineabove"],
        )
        print(tabulate.tabulate(self.rows, headers=self.header, tablefmt=compact_format, showindex=False))
def clean_dirs(dirs):
"""Remove in place, because os.walk uses the same variable"""
remove = []
@@ -694,7 +770,7 @@ def setup_options():
help="Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity. "
+ "If value is a filename, search similar to that image. "
+ "Append with ',value' to limit similarity. default to 20."
+ "The output columns: SD SimilarityDiff., CD ColorDiff., "
+ "The output columns: PD PerceptualDiff., CD ColorDiff., "
+ "RD AspectRatioDiff.,Shp SharpnessIndex. This function does not return exact duplicates.",
)