Files
q-tools/py-packages/imagelist2/imagelist2/__init__.py
2025-06-09 14:18:41 +03:00

927 lines
29 KiB
Python

import os
import re
import sys
import traceback
from argparse import ArgumentParser
from datetime import datetime
import tabulate
from imagelist2.db import DB, DBCachedWriter, sqlite_sqrt, sqlite_square
from imagelist2.image import ImageBrokenError, ImageMeasure, is_image_extension
from tqdm import tqdm
# Package version; shown in the command-line help description.
__version__ = "0.0.8"
# Default SQLite database file name (default value of the -f option).
SQLFILE = "image-list.sqlite"
# Directory names skipped while scanning; "db -x NAME" appends to this list.
BADDIRS = ["_tn", "_med", ".tn", ".med"]
# NOTE(review): not referenced in the code visible here - confirm it is still needed.
MINSIZE = 0
class ImageList:
    """Maintain and query an SQLite index of image files.

    Every public method reads its parameters from the parsed command-line
    options stored in ``self.options``.
    """

    def __init__(self, opts):
        """Open the database named by ``opts.sqlfile``.

        Args:
            opts: Parsed command-line options.
        """
        self.options = opts
        self.db = DB(self.options.sqlfile)
        self.db_writer = DBCachedWriter(self.db)
        # Directory that contains the database file.
        self.root_path = os.path.dirname(os.path.realpath(self.options.sqlfile))
        # Column titles shared by all similarity reports.
        self.similarity_header = ("#", "File", "PD", "CD", "RD", "Shp", "W", "H")

    def recursive_add(self):
        """Walk ``options.startpath`` and add new or changed images to ``list``.

        The tree is walked twice: first to count directories for the progress
        bar, then to process the files.  Files already in the database are
        re-checked only when ``options.changed`` is set; unknown files are
        inserted unless ``options.no_add`` is set.
        """
        start = os.path.realpath(self.options.startpath)
        follow = self.options.symlinks

        dir_count = 0
        for _path, dirs, _files in os.walk(start, followlinks=follow):
            clean_dirs(dirs)
            dir_count += 1

        image_count = 0
        progress = tqdm(
            total=dir_count,
            desc="Directories",
            position=0,
            delay=1,
            leave=False,
        )
        for path, dirs, files in os.walk(start, followlinks=follow):
            progress.update()
            progress.write(self.db.file2relative(path))
            clean_dirs(dirs)
            candidates = [os.path.join(path, name) for name in files]
            if not follow:
                # Filter BEFORE resolving: realpath() never returns a link,
                # so testing the resolved path would keep every symlink.
                candidates = clean_syms(candidates)
            files = sorted(os.path.realpath(c) for c in candidates)
            # Sorting in place also fixes the order os.walk descends in.
            dirs.sort()
            db_files = self.db.get_folder_contents(path + "/")
            for file in tqdm(files, desc="Files", delay=1, position=1, leave=False):
                if not is_image_extension(file):
                    continue
                image = ImageMeasure(file)
                if file in db_files:
                    # The hash check runs only when the time check found nothing.
                    if self.options.changed and (
                        self.db.is_time_mismatch(image) or self.db.is_hash_mismatch(image)
                    ):
                        image_count += 1
                        self.add_single(image, change=True)
                elif not self.options.no_add:
                    image_count += 1
                    self.add_single(image, change=False)
        progress.close()
        self.db_writer.commit()
        if image_count > 0:
            print(f"Added/changed {image_count} images")

    def add_single(self, image, change=False):
        """Insert a row into ``list`` or update the existing one.

        Args:
            image: Object providing ``get_hash()``, ``get_time()``,
                ``get_size()`` and ``filename``.
            change: Update the row of ``image.filename`` instead of inserting.

        On any error the message and traceback are printed to stdout and the
        process exits with status 1.
        """
        if change:
            query = "UPDATE list SET hash=?, date=? ,size=? WHERE file=?"
            error_msg = f"error changing image: {image.filename}"
        else:
            query = "INSERT INTO list(hash,date,size,file) VALUES (?,?,?,?)"
            error_msg = f"error adding image: {image.filename}"
        try:
            self.db_writer.execute(
                query,
                (
                    image.get_hash(),
                    image.get_time(),
                    image.get_size(),
                    image.filename,
                ),
            )
        except Exception:
            print(error_msg)
            traceback.print_exc(file=sys.stdout)
            sys.exit(1)

    def base_add(self):
        """Create a ``data`` row for every hash in ``list`` that has none.

        Broken images get a row with only ``broken`` set; others get
        portrait flag, width, height and description.
        """
        missing_base = (
            self.db.cursor()
            .execute(
                """
                SELECT list.hash, list.file
                FROM list
                LEFT JOIN data ON data.hash = list.hash
                WHERE data.hash IS NULL AND data.broken IS NULL
                """
            )
            .fetchall()
        )
        if not missing_base:
            return
        seen_hash = set()
        for row in tqdm(missing_base, desc="Base info", delay=1):
            file_hash, filename = row[0], row[1]
            # Several files may share one hash; one data row is enough.
            if file_hash in seen_hash:
                continue
            seen_hash.add(file_hash)
            if filename is None:
                continue
            image = ImageMeasure(filename)
            if image.is_broken():
                self.db_writer.execute(
                    """INSERT INTO data(hash,broken)
                    VALUES(?,?)""",
                    (file_hash, True),
                )
            else:
                self.db_writer.execute(
                    """INSERT INTO data(hash,portrait,width,height,description,broken)
                    VALUES(?,?,?,?,?,?)""",
                    (
                        file_hash,
                        image.get_portrait(),
                        image.get_width(),
                        image.get_height(),
                        image.get_description(),
                        False,
                    ),
                )
        self.db_writer.commit()

    def delete_missing(self):
        """Delete ``list`` rows whose file no longer exists on disk."""
        rows = self.db.cursor().execute("SELECT file FROM list").fetchall()
        to_delete = [
            row[0]
            for row in tqdm(rows, delay=1, desc="Clean files")
            if not os.path.exists(row[0])
        ]
        cursor = self.db.cursor()
        for file in tqdm(to_delete, desc="Cleaning", delay=1):
            cursor.execute("DELETE FROM list where file == ?", (file,))
        self.db.conn.commit()
        if to_delete:
            print(f"Cleaned {len(to_delete)} images")

    def clean_data(self):
        """Delete ``data`` and ``tags`` rows that no listed file refers to.

        ``data`` rows whose ``broken`` column is NULL are deleted as well.
        """
        self._purge_hashes(
            """SELECT data.hash FROM data LEFT JOIN list ON list.hash = data.hash WHERE list.hash IS NULL OR data.broken IS NULL""",
            "DELETE FROM data where hash = ?",
            "metadata",
        )
        self._purge_hashes(
            """SELECT tags.hash FROM tags LEFT JOIN list ON list.hash = tags.hash WHERE list.hash IS NULL""",
            "DELETE FROM tags where hash = ?",
            "tags",
        )

    def _purge_hashes(self, select_sql, delete_sql, label):
        """Run ``delete_sql`` for each hash returned by ``select_sql``; report the count."""
        hashes = [row[0] for row in self.db.cursor().execute(select_sql)]
        if not hashes:
            return
        cursor = self.db.cursor()
        for stale in hashes:
            cursor.execute(delete_sql, (stale,))
        self.db.conn.commit()
        print(f"Cleaned {len(hashes)} {label}")

    def measure(self):
        """Fill in missing perceptual hash, sharpness and colour columns.

        Only ``data`` rows not marked broken are considered.  When measuring
        raises ``ImageBrokenError`` (or leaves ``image.broken`` set), just the
        ``broken`` column of the row is updated.
        """
        missing_measurements = (
            self.db.cursor()
            .execute(
                """
                SELECT
                    list.file,
                    data.hash,
                    data.p_hash,
                    data.sharpness,
                    data.R,
                    data.G,
                    data.B,
                    data.BR,
                    data.BG,
                    data.BB
                FROM data
                LEFT JOIN list ON data.hash = list.hash
                WHERE
                    (
                        data.p_hash IS NULL
                        OR data.sharpness IS NULL
                        OR data.R IS NULL
                    )
                    AND
                    (
                        data.broken IS FALSE OR data.broken IS NULL
                    )
                """
            )
            .fetchall()
        )
        if not missing_measurements:
            return
        handled = set()
        for row in tqdm(missing_measurements, desc="Measure", delay=1, smoothing=0.01):
            filename = row[0]
            if filename is None:
                continue
            # The join yields one row per file; measure each hash only once.
            if row[1] in handled:
                continue
            handled.add(row[1])
            try:
                image = ImageMeasure(filename)
                # Seed the object with the values already stored.
                (
                    image.hash,
                    image.p_hash,
                    image.sharpness,
                    image.colors["R"],
                    image.colors["G"],
                    image.colors["B"],
                    image.colors["BR"],
                    image.colors["BG"],
                    image.colors["BB"],
                ) = row[1:]
                # Calculate if required
                image.get_p_hash()
                image.sharpness = image.get_sharpness()
                image.colors.update(image.get_colors())
                if image.broken:
                    print("image broke")
                    raise ImageBrokenError()
            except ImageBrokenError:
                self.db_writer.execute(
                    """UPDATE data SET broken = ?
                    WHERE hash = ?
                    """,
                    (
                        image.broken,
                        image.hash,
                    ),
                )
                continue
            self.db_writer.execute(
                """UPDATE data SET
                    p_hash = ?,
                    sharpness = ?,
                    R = ?,
                    G = ?,
                    B = ?,
                    BR = ?,
                    BG = ?,
                    BB = ?,
                    broken = ?
                WHERE hash = ?
                """,
                (
                    image.p_hash,
                    image.sharpness,
                    image.colors["R"],
                    image.colors["G"],
                    image.colors["B"],
                    image.colors["BR"],
                    image.colors["BG"],
                    image.colors["BB"],
                    image.broken,
                    image.hash,
                ),
            )
        self.db_writer.commit()

    def disk_used(self):
        """Print summed file sizes below ``options.path``.

        Paths are grouped by their first ``options.diskused_depth``
        components relative to the search path; no depth means no grouping
        limit.
        """
        depth = self.options.diskused_depth
        if depth is None:
            depth = 9999999999
        depth = int(depth)
        searchpath = os.path.realpath(self.options.path)
        result = self.db.cursor().execute(
            "SELECT size, REPLACE(file,?,'') as path FROM list WHERE file LIKE ?",
            (
                searchpath + "/",
                # Trailing separator keeps "/a/b" from matching "/a/bc/...".
                os.path.join(searchpath, "") + "%",
            ),
        )
        # Insertion-ordered dict: same output order as first-seen grouping.
        totals = {}
        for row in result:
            size, rel_path = row[0], row[1]
            group = "/".join(rel_path.split("/")[0:depth])
            if len(group) != len(rel_path):
                # Truncated, so the group is a directory.
                group += "/"
            if group in totals:
                totals[group] += size
            else:
                totals[group] = size
        table = Tabulate(("Size[b]", "Size", "Path"))
        for group, total in totals.items():
            table.append((total, humanize_size(total), group))
        table.print()

    def broken(self):
        """Print the files flagged as broken, one per line."""
        result = self.db.cursor().execute(
            """
            SELECT
                file FROM files
            WHERE broken IS TRUE
            """,
        )
        print("#File")
        for row in result:
            print(row[0])

    def db_print(self):
        """Print every row of ``files`` as a table with its column names."""
        result = self.db.cursor().execute(
            """
            SELECT * FROM files
            """,
        )
        table = Tabulate([column[0] for column in result.description])
        for row in result:
            table.append(row)
        table.print()

    def duplicates(self):
        """Print files that share a file hash, grouped per hash.

        The first file of each group is marked ``==``, the others ``>N``.
        """
        result = self.db.cursor().execute(
            """
            WITH
            duplicates AS (SELECT hash FROM list GROUP BY hash HAVING count(hash) > 1),
            f AS (SELECT
                list.hash,list.file FROM list
                LEFT JOIN duplicates ON (list.hash = duplicates.hash)
                WHERE duplicates.hash IS NOT NULL
                ORDER BY file
            )
            SELECT
                CAST((row_number() OVER (PARTITION BY f.hash))-1 AS TEXT) AS row,
                RELATIVE(file)
            FROM f
            """,
        )
        table = Tabulate(["#", "File"])
        for index, path in result:
            marker = "==" if index == "0" else f">{index}"
            table.append([marker, path])
        table.print()

    def nearestcolor(self):
        """Find closest matching images to given RGB color.

        ``options.nearestcolor`` is either ``R,G,B[,hits]`` or
        ``file[,hits]``; in the second form the colour is measured from the
        file and that file is left out of the result.

        Raises:
            ValueError: The integer form has fewer than three values.
        """
        spec = self.options.nearestcolor
        try:
            values = [int(i) for i in spec.strip().strip('"').split(",")]
        except ValueError:
            # Not all integers: interpret as "file[,hits]".
            parts = spec.strip().split(",")
            limit = 1 if len(parts) == 1 else int(parts[1])
            colors = ImageMeasure(parts[0]).get_colors()
            exclude = os.path.realpath(parts[0])
            src = (colors["R"], colors["G"], colors["B"], limit)
        else:
            if len(values) == 3:
                values.append(1)
            if len(values) < 4:
                raise ValueError("color must be given as R,G,B[,hits] or file[,hits]")
            src = values
            exclude = ""
        self.db.conn.create_function("SQUARE", 1, sqlite_square)
        self.db.conn.create_function("SQRT", 1, sqlite_sqrt)
        result = self.db.cursor().execute(
            """
            WITH distances AS (
                SELECT
                    hash,
                    ROUND(SQRT(SQUARE(BR-?)+SQUARE(BG-?)+SQUARE(BB-?)),1) as distance,
                    BR,BG,BB
                FROM data
                WHERE BR IS NOT NULL
                ORDER BY distance
                LIMIT ?
            )
            SELECT
                RELATIVE(list.file),
                distances.distance,
                distances.BR,
                distances.BG,
                distances.BB
            FROM list
            LEFT JOIN
                distances ON (distances.hash = list.hash)
            WHERE distances.hash IS NOT NULL AND list.file != ?
            ORDER BY distances.distance
            """,
            (src[0], src[1], src[2], src[3], exclude),
        )
        table = Tabulate(("Path", "Dist", "BR", "BG", "BB"))
        for path, distance, red, green, blue in result:
            table.append(
                (
                    path,
                    str(distance),
                    str(int(red)),
                    str(int(green)),
                    str(int(blue)),
                )
            )
        table.print()

    def similarity(self):
        """Print the similarity reports selected by the options.

        ``options.similarity`` is a threshold (compare all indexed images
        with each other) or ``file[,threshold]`` (compare against one file);
        the threshold defaults to 20.  ``options.visual_duplicate``
        additionally prints images sharing a perceptual hash.
        """
        if self.options.similarity:
            try:
                thr = int(self.options.similarity)
                file = None
            except ValueError:
                parts = self.options.similarity.split(",")
                thr = 20 if len(parts) == 1 else int(parts[1])
                file = parts[0]
            if file is None:
                # Similarity inside the dataset
                self._print_self_similarity(thr)
            else:
                # Read single image, and find similarity to that
                self._print_visually_similar(file, thr)
        if self.options.visual_duplicate:
            self._print_visual_duplicates()

    def _print_visually_similar(self, file, thr):
        """Print indexed images within perceptual distance ``thr`` of ``file``."""
        cmp_image = ImageMeasure(file)
        cmp_image.set_all()
        # NOTE(review): self-assignment kept from the original; looks like a no-op.
        cmp_image.filename = cmp_image.filename
        compare_list = self.db.cursor().execute(
            """SELECT
                RELATIVE(file),width,height,sharpness,
                PDISTANCE(p_hash, ?) AS p_dist,
                COLORDIFF(R,G,B,?,?,?) AS c_diff,
                SHAPEDIFF(width,height,?,?) AS s_diff
            FROM files
            WHERE p_hash IS NOT NULL AND
                sharpness > 0 AND
                hash != ? AND
                p_dist <= ?
            ORDER BY p_dist, file""",
            (
                cmp_image.p_hash,
                cmp_image.colors["R"],
                cmp_image.colors["G"],
                cmp_image.colors["B"],
                cmp_image.width,
                cmp_image.height,
                cmp_image.hash,
                thr,
            ),
        )
        table = Tabulate(self.similarity_header)
        table.append(
            (
                "==",
                self.db.file2relative(cmp_image.filename),
                0,
                0,
                0,
                cmp_image.sharpness,
                cmp_image.width,
                cmp_image.height,
            )
        )
        for counter, row in enumerate(compare_list, start=1):
            f2, w2, h2, s2, pdist, cdiff, sdiff = row
            table.append((f">{counter}", f2, pdist, cdiff, sdiff, s2, w2, h2))
        table.print()

    def _get_visual_duplicates(self):
        """Return a cursor over files whose perceptual hash occurs more than once."""
        return self.db.cursor().execute(
            """
            WITH
            duplicates AS (SELECT p_hash FROM data WHERE p_hash IS NOT NULL GROUP BY p_hash HAVING count(p_hash) > 1)
            SELECT
                RELATIVE(files.file) AS file,
                files.width,
                files.height,
                files.sharpness,
                files.R,
                files.G,
                files.B,
                files.p_hash
            FROM files
            WHERE p_hash IN ( SELECT p_hash FROM duplicates )
            ORDER BY p_hash, files.size DESC
            """
        )

    def _print_visual_duplicates(self):
        """Print groups of files sharing a perceptual hash.

        The first file of each group is the reference (``==``); colour and
        shape differences of the others are computed against it.
        """

        def measured(path, width, height, sharpness, red, green, blue):
            """Build an ImageMeasure carrying the stored values."""
            image = ImageMeasure(path)
            image.width = width
            image.height = height
            image.sharpness = sharpness
            image.colors.update({"B": blue, "G": green, "R": red})
            return image

        current_hash = None
        counter = 0
        reference = None
        table = Tabulate(self.similarity_header)
        for row in self._get_visual_duplicates():
            f, w, h, s, r, g, b, p_hash = row
            if current_hash != p_hash:
                # New group: this row becomes the reference image.
                current_hash = p_hash
                counter = 0
                table.append(("==", f, 0, 0, 0, s, w, h))
                reference = measured(f, w, h, s, r, g, b)
                continue
            counter += 1
            other = measured(f, w, h, s, r, g, b)
            cdiff = reference.color_difference(other)
            sdiff = reference.shape_difference(other)
            table.append((f">{counter}", f, 0, cdiff, sdiff, s, w, h))
        table.print()

    def _print_self_similarity(self, thr):
        """Print all pairs of indexed images within perceptual distance ``thr``."""
        fingerprint_list = self.db.cursor().execute(
            """
            WITH disttab AS (
                WITH
                t1 AS ( SELECT * FROM files WHERE p_hash IS NOT NULL ),
                t2 AS ( SELECT * FROM files WHERE p_hash IS NOT NULL )
                SELECT
                    RELATIVE(t1.file) AS file1,
                    t1.width AS width1,
                    t1.height AS height1,
                    t1.sharpness AS sharpness1,
                    RELATIVE(t2.file) AS file2,
                    t2.width AS width2,
                    t2.height AS height2,
                    t2.sharpness AS sharpness2,
                    PDISTANCE(t1.p_hash,t2.p_hash) AS p_distance,
                    t1.R AS t1r, t1.G AS t1g, t1.B AS t1b,
                    t2.R AS t2r, t2.G AS t2g, t2.B AS t2b
                FROM t1 INNER JOIN t2
                    ON t1.file < t2.file
                WHERE p_distance <= ?
                ORDER BY t1.file, p_distance, t2.file
            )
            SELECT
                file1,width1,height1,sharpness1,
                file2,width2,height2,sharpness2,
                p_distance,
                ROUND(SQRT((t1r-t2r)*(t1r-t2r)+(t1g-t2g)*(t1g-t2g)+(t1b-t2b)*(t1b-t2b)),1) AS c_diff,
                ROUND(ABS((CAST(width1 AS FLOAT) / CAST(height1 AS FLOAT)) - (CAST(width2 AS FLOAT) / CAST(height2 AS FLOAT))), 4) AS s_diff
            FROM disttab
            """,
            (thr,),
        )
        current_file = None
        counter = 0
        table = Tabulate(self.similarity_header)
        for row in fingerprint_list:
            f1, w1, h1, s1, f2, w2, h2, s2, pdist, cdiff, sdiff = row
            if current_file != f1:
                # First pair of a new left-hand file: emit its header row.
                current_file = f1
                counter = 0
                table.append(("==", f1, 0, 0, 0, s1, w1, h1))
            counter += 1
            table.append((f">{counter}", f2, pdist, cdiff, sdiff, s2, w2, h2))
        table.print()

    def tag_manage(self):
        """Add and delete tags of ``options.file``, then print its tags.

        Raises:
            Exception: The file is not in the database.
        """
        fname = os.path.realpath(self.options.file)
        file_hash = self.db.file2hash(fname)
        if file_hash is None:
            raise Exception(f"{fname} not in database")
        for add in self.options.add_tag:
            self.db.cursor().execute(
                "INSERT INTO tags(hash,tag) VALUES (?,?)",
                (file_hash, add),
            )
        for rm in self.options.delete_tag:
            self.db.cursor().execute(
                "DELETE FROM tags WHERE hash = ? AND tag = ?",
                (file_hash, rm),
            )
        if self.options.add_tag or self.options.delete_tag:
            self.db.conn.commit()
        tags = self.db.cursor().execute(
            """
            SELECT
                tags.tag
            FROM tags
            WHERE tags.hash = ?
            """,
            (file_hash,),
        )
        print(",".join([x[0] for x in tags]))
class Tabulate:
    """Collect rows under a fixed header and print them pipe-separated."""

    def __init__(self, header):
        """Start an empty table with the given column titles."""
        self.rows = []
        self.header = header

    def append(self, row):
        """Add one row (a sequence of cell values) to the table."""
        self.rows.append(row)

    def print(self):
        """Print header and rows with ``|`` separators and no rule lines."""
        pipes = tabulate.DataRow("|", "|", "|")
        compact = tabulate.TableFormat(
            lineabove=None,
            linebelowheader=None,
            linebetweenrows=None,
            linebelow=None,
            headerrow=pipes,
            datarow=pipes,
            padding=0,
            with_header_hide=["lineabove"],
        )
        # Replace tabulate's "github" entry so the name selects the compact layout.
        tabulate._table_formats["github"] = compact
        rendered = tabulate.tabulate(
            self.rows,
            headers=self.header,
            tablefmt="github",
            showindex=False,
        )
        print(rendered)
def clean_dirs(dirs):
    """Drop excluded and hidden directory names from ``dirs`` in place.

    The list object itself is modified (not replaced) because os.walk
    keeps using the same list to decide where to descend.
    """
    dirs[:] = [name for name in dirs if name not in BADDIRS and not name.startswith(".")]
def clean_syms(files):
    """Return the entries of ``files`` that are not symbolic links."""
    kept = []
    for candidate in files:
        if os.path.islink(candidate):
            continue
        kept.append(candidate)
    return kept
def humanize_size(size, precision=1):
    """Format a byte count with a binary (1024-based) unit suffix.

    Args:
        size: Number of bytes, or None.
        precision: Decimals shown once the value has been scaled; unscaled
            byte counts are always shown without decimals.

    Returns:
        "nan" for None, otherwise a string such as "512B" or "1.5KB".
        Values beyond the largest suffix stay in that unit.
    """
    if size is None:
        return "nan"
    suffixes = ["B", "KB", "MB", "GB", "TB", "PB", "EB"]
    index = 0
    decimals = 0
    # Stop at the last suffix instead of indexing past the list.
    while size > 1024 and index < len(suffixes) - 1:
        index += 1
        size = size / 1024.0
        decimals = precision
    return "%.*f%s" % (decimals, size, suffixes[index])
def humanize_date(date):
    """Format a Unix timestamp as local time ``YYYY-MM-DD HH:MM:SS``.

    Args:
        date: Timestamp accepted by ``int()``, or None.

    Returns:
        The formatted string, or "" when ``date`` is None.
    """
    if date is None:
        return ""
    return datetime.fromtimestamp(int(date)).strftime("%Y-%m-%d %H:%M:%S")
def setup_options():
    """Parse the command line and return the options namespace.

    Defines the sub-commands ``help``, ``db``, ``search``, ``du`` and
    ``tag``.  ``help`` (or no command at all) prints usage and exits with
    status 0.  For ``db`` the ``-x`` values are appended to ``BADDIRS``.

    Returns:
        The ``argparse`` namespace, including ``command`` and ``sqlfile``.
    """
    parser = ArgumentParser(description=f"Maintains a list of images sqlite file (v{__version__})")
    parser.add_argument(
        "-f",
        action="store",
        dest="sqlfile",
        default=SQLFILE,
        help="SQL file name to use [%(default)s]",
    )
    subparsers = parser.add_subparsers(title="Command", dest="command")
    # Registered only so "help" is a valid command; handled below.
    subparsers.add_parser("help", help="Help on all commands")
    db = subparsers.add_parser("db", help="Update database")
    search = subparsers.add_parser("search", help="Search similarity")
    du = subparsers.add_parser("du", help="Disk usage")
    tag = subparsers.add_parser("tag", help="Tag manager")

    # --- db ---
    db.add_argument(
        "--no-add",
        "-a",
        action="store_true",
        dest="no_add",
        default=False,
        help="Do not add new files [%(default)s]",
    )
    db.add_argument(
        "--measure",
        "-m",
        action="store_true",
        dest="measure",
        default=False,
        help="Measure various statistics for similarity/color searches. [%(default)s]",
    )
    db.add_argument(
        "--changed",
        "-c",
        action="store_true",
        dest="changed",
        default=False,
        help="Search for changed files and update their entries [%(default)s]",
    )
    db.add_argument(
        "--no-delete",
        "-d",
        action="store_true",
        dest="no_delete",
        default=False,
        help="Do not delete non-existing entries [%(default)s]",
    )
    db.add_argument(
        "--no-delete-data",
        "-D",
        action="store_true",
        dest="no_delete_data",
        default=False,
        help="Do not delete unused metadata [%(default)s]",
    )
    db.add_argument(
        "-x",
        action="append",
        dest="exclude",
        default=[],
        help="Exclude folder name. This option may be issued several times.",
    )
    db.add_argument(
        "-l",
        action="store_true",
        dest="symlinks",
        default=False,
        help="Follow symbolic links [%(default)s]",
    )
    db.add_argument(
        "--print",
        action="store_true",
        dest="print",
        default=False,
        help="Print the whole database [%(default)s]",
    )
    db.add_argument("startpath", action="store", default=".", nargs="?", help="Path to start scanning for images.")

    # --- du ---
    du.add_argument(
        "-d",
        type=int,
        action="store",
        dest="diskused_depth",
        default=None,
        help="Depth of summarization for du.",
    )
    du.add_argument(
        type=str,
        action="store",
        dest="path",
        default=".",
        help="Print directory sizes. Argument is the path where directories are listed from.",
        nargs="?",
    )

    # --- search ---
    search.add_argument(
        "--broken",
        action="store_true",
        dest="broken",
        default=False,
        help="Return a list of broken files [%(default)s]",
    )
    search.add_argument(
        "--dup",
        action="store_true",
        dest="duplicate",
        default=False,
        help="Return a list of duplicate files, based on file hashes. [%(default)s]",
    )
    search.add_argument(
        "--visdup",
        action="store_true",
        dest="visual_duplicate",
        default=False,
        help="Return a list of visually exact duplicate files, based on perceptual hashes. [%(default)s]",
    )
    search.add_argument(
        "--color",
        type=str,
        dest="nearestcolor",
        default=False,
        help="Search list for nearest ambient color. format: R,G,B in uint8. Add fourth value to limit search to number of hits. Also accepts format file,hits to find nearest color to given file.",
    )
    search.add_argument(
        "--similar",
        type=str,
        dest="similarity",
        default=None,
        help="Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity. "
        + "If value is a filename, search similar to that image. "
        + "Append with ',value' to limit similarity. default to 20. "
        + "The output columns: PD PerceptualDiff., CD ColorDiff., "
        + "RD AspectRatioDiff.,Shp SharpnessIndex. This function does not return exact duplicates.",
    )

    # --- tag ---
    tag.add_argument(
        "-t",
        action="append",
        dest="add_tag",
        default=[],
        help="Give file a tag.",
    )
    tag.add_argument(
        "-d",
        action="append",
        dest="delete_tag",
        default=[],
        help="Delete a tag.",
    )
    tag.add_argument(
        type=str,
        dest="file",
        default=None,
        help="File name for tagging.",
    )

    options = parser.parse_args()
    if options.command == "help":
        parser.print_help()
        for title, sub in (("db", db), ("search", search), ("du", du), ("tag", tag)):
            print(f"\n====\nCommand: {title}")
            sub.print_help()
        sys.exit(0)
    if options.command is None:
        parser.print_help()
        sys.exit(0)
    if options.command == "db":
        # Extra excludes act through the module-level list used while scanning.
        BADDIRS.extend(options.exclude)
    return options
def main():
    """Run the command selected on the command line."""
    options = setup_options()
    il = ImageList(options)
    if options.command == "db":
        if not options.no_add:
            il.recursive_add()
            il.base_add()
        if not options.no_delete:
            il.delete_missing()
        if not options.no_delete_data:
            il.clean_data()
        if options.measure:
            # Base rows must exist before they can be measured.
            il.base_add()
            il.measure()
        if options.print:
            il.db_print()
    if options.command == "du":
        il.disk_used()
    if options.command == "search":
        if options.duplicate:
            il.duplicates()
        if options.visual_duplicate:
            il.similarity()
        if options.nearestcolor:
            il.nearestcolor()
        # similarity() prints both the --similar and the --visdup report, so
        # it must run only once when both options are given.
        if options.similarity and not options.visual_duplicate:
            il.similarity()
        if options.broken:
            il.broken()
    if options.command == "tag":
        il.tag_manage()
    print("")


if __name__ == "__main__":
    main()