new imagelist implementation

This commit is contained in:
Ville Rantanen
2024-04-07 17:28:15 +03:00
parent aaf6044e83
commit 31ab87b9df
5 changed files with 1285 additions and 0 deletions

View File

@@ -0,0 +1,125 @@
# Recipes run under bash. .ONESHELL makes each recipe execute as a single
# script, so `set -e` and sourced helpers stay in effect for the whole recipe.
SHELL := /bin/bash
.PHONY:
.ONESHELL:
# List every target that carries a "## description" annotation. A target
# named "-" is printed as a section heading instead of a regular entry.
help: ## *:・゚✧*:・゚✧ This help *:・゚✧*:・゚✧
@printf "\033[36;1m %14s \033[0;32;1m %s\033[0m\n" Target Description
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
awk ' \
BEGIN {FS = ":.*?## "}; \
{ if ( $$1 != "-") { \
printf "\033[31;1m[ \033[36;1m%14s \033[31;1m]\033[0;32;1m %s\033[0m\n", $$1, $$2 \
} else { \
printf " \033[0;33;1m=^= %-25s =^=\033[0m\n", $$2 \
} \
} \
'
# Section heading picked up by `make help` (target name "-").
-: ## Building
# Full cycle: clean the test files, install the package, run the tests.
all: clean install test
# Format the sources and pip-install the package inside the "imagelist2"
# environment. NOTE(review): useve-runner/useve and py-format are external
# helpers not defined here — presumably a virtualenv activator and a code
# formatter; confirm.
install: ## Run installer
set -e
. useve-runner
useve imagelist2
py-format ./
pip install ./
# Shell snippets (run through `eval` in the test recipes) that dump the
# "files" view and the "data" and "list" tables as formatted tables.
PRINT_TABLE := 'sqlite3 -header image-list.sqlite "SELECT * FROM files" | tabulate -1 -s "\|"'
PRINT_DATA := 'sqlite3 -header image-list.sqlite "SELECT * FROM data" | tabulate -1 -s "\|"'
PRINT_LIST := 'sqlite3 -header image-list.sqlite "SELECT * FROM list" | tabulate -1 -s "\|"'
# Aggregate target running every test-* target in order.
test: test-db test-du test-dup test-tag ## Test
# Build a small tree of generated images (solid colours, exact copies, noise,
# resized/flipped variants, files in hidden and excluded folders) and run
# `image-list db` against it after each change: add, delete, modify (-c) and
# --measure. Finishes by checking the stored hashes with sha1sum.
test-db:
set -e
. useve-runner
useve imagelist2
echo =================================
mkdir -p folder1/folder2 folder1/.hidden folder1/_med
convert -size 600x300 xc:red red.jpg
cp red.jpg folder1/.hidden/
convert -size 600x300 xc:cyan folder1/cyan.jpg
convert -size 600x300 xc:cyan folder1/cyan.png
cp folder1/cyan.png folder1/cyan_dup2.png
cp folder1/cyan.png folder1/cyan_dup3.png
cp folder1/cyan.jpg folder1/cyan_dup2.jpg
cp folder1/cyan.jpg folder1/cyan_dup3.jpg
convert -size 600x300 plasma: folder1/noisy.png
convert -size 600x300 plasma: -blur 0x3 folder1/blur.png
convert -size 300x600 xc:blue folder1/folder2/blue.tif
convert wizard: folder1/wizard.jpg
convert folder1/wizard.jpg -resize 95%x98% folder1/wizard.mod.jpg
convert folder1/wizard.jpg -flip -resize 95%x98% folder1/wizard.flip.jpg
image-list db -x imagelist2
eval ${PRINT_TABLE}
convert -size 600x600 xc:black folder1/black.png
image-list db -x imagelist2
eval ${PRINT_TABLE}
rm folder1/black.png
image-list db -x imagelist2
eval ${PRINT_TABLE}
mogrify -rotate 90 folder1/cyan.png
image-list db -x imagelist2 -c
eval ${PRINT_LIST}
eval ${PRINT_DATA}
image-list db -x imagelist2
eval ${PRINT_DATA}
image-list db -x imagelist2 --measure
eval ${PRINT_DATA}
eval ${PRINT_TABLE}
echo "========= check sha1 ============="
sqlite3 image-list.sqlite "SELECT hash,file FROM list" -separator ' ' | sha1sum -c -
# Exercise `image-list du` with no depth, depth 2, depth 1, and depth 1
# restricted to folder1/.
test-du:
set -e
. useve-runner
useve imagelist2
echo =================================
image-list du
echo =================================
image-list du -d 2
echo =================================
image-list du -d 1
echo =================================
image-list du -d 1 folder1/
# Exercise every `image-list search` mode: exact duplicates, visual
# duplicates, nearest colour (by RGB value and by file) and similarity
# (by threshold and by file).
test-dup:
set -e
. useve-runner
useve imagelist2
echo ========== duplicates =======================
image-list search --dup
echo ========== visual duplicates =======================
image-list search --visdup
echo ========== nearest red ======================
image-list search --color 255,0,0,10
echo ========== nearest from file ======================
image-list search --color red.jpg,3
echo ========== Similar by dhash ======================
image-list search --similar 30
echo ========== Similar by file ======================
image-list search --similar folder1/wizard.jpg
# Exercise `image-list tag`: add tags (one of them twice), list, delete, and
# finally query a file that does not exist (failure is tolerated via `|| true`).
test-tag:
set -e
. useve-runner
useve imagelist2
echo ========== tag add =======================
image-list tag -t plain -t red red.jpg
image-list tag -t red red.jpg
echo ========== tag list =======================
image-list tag red.jpg
echo ========== tag delete =======================
image-list tag -d red red.jpg
echo ========== tag list fail =======================
image-list tag red.jpg.missing || true
# Create the "imagelist2" environment with `useve mk`.
# NOTE(review): useve is an external helper — confirm what `mk` sets up.
init: ## Init test env
. useve-runner
useve mk imagelist2
# Remove everything the tests generate; never fails.
clean: ## Clean testfiles
rm -rf folder1 image-list.sqlite red.jpg || true

View File

@@ -0,0 +1,776 @@
import os
import re
import sys
import traceback
from argparse import ArgumentParser
from datetime import datetime
from imagelist2.db import DB, sqlite_sqrt, sqlite_square
from imagelist2.image import ImageMeasure, is_image_extension
from tqdm import tqdm
# Package version. NOTE(review): setup.py appears to read this line textually
# through version_reader(), so keep it a plain double-quoted assignment — confirm.
__version__ = "0.0.1"
# Default database file name, used as the default of the -f option.
SQLFILE = "image-list.sqlite"
# Image files are recognised by extension through is_image_extension();
# no filename regex is used in this module.
# Directory names that clean_dirs() removes from the walk; setup_options()
# appends the -x exclusions of the "db" command to this list.
BADDIRS = ["_tn", "_med", ".tn", ".med"]
# Not referenced anywhere in the visible code.
MINSIZE = 0
class ImageList:
def __init__(self, opts):
    """Keep the parsed options and open the database they point to.

    ``root_path`` is the directory that contains the SQLite file.
    """
    sqlfile = opts.sqlfile
    self.options = opts
    self.db = DB(sqlfile)
    self.root_path = os.path.dirname(os.path.realpath(sqlfile))
def recursive_add(self):
    """Walk the start path and record new (and optionally changed) images.

    The tree is walked twice: once to count directories so the progress bar
    has a total, then again to do the work. For every image file in a
    directory:

    * already listed and ``options.changed`` set: the entry is updated when
      its mtime or, failing that, its hash differs from the database;
    * not listed and ``options.no_add`` unset: the file is inserted.

    Changes are committed after each directory.
    """
    start = os.path.realpath(self.options.startpath)
    follow_links = self.options.symlinks

    dir_count = 0
    for _path, dirs, _files in os.walk(start, followlinks=follow_links):
        clean_dirs(dirs)
        dir_count += 1

    progress = tqdm(
        total=dir_count,
        desc="Directories",
        position=0,
        delay=1,
        leave=False,
    )
    try:
        for path, dirs, files in os.walk(start, followlinks=follow_links):
            progress.update()
            progress.write(self.db.file2relative(path))
            clean_dirs(dirs)
            candidates = [os.path.join(path, f) for f in files]
            if not follow_links:
                # Links must be filtered before realpath(): a resolved path
                # is never a link, so filtering afterwards removes nothing.
                candidates = clean_syms(candidates)
            files = sorted(os.path.realpath(f) for f in candidates)
            # Sorting in place also fixes the order os.walk descends in.
            dirs.sort()
            db_files = set(self.db.get_folder_contents(path + "/"))
            for file in tqdm(files, desc="Files", delay=1, position=1, leave=False):
                if not is_image_extension(file):
                    continue
                image = ImageMeasure(file)
                if file in db_files:
                    # The cheap mtime check runs first; hashing only happens
                    # when the timestamps agree.
                    if self.options.changed and (
                        self.db.is_time_mismatch(image) or self.db.is_hash_mismatch(image)
                    ):
                        self.add_single(image, change=True)
                elif not self.options.no_add:
                    self.add_single(image, change=False)
            self.db.conn.commit()
    finally:
        progress.close()
def add_single(self, image, change=False):
if change:
query = "UPDATE list SET hash=?, date=? ,size=? WHERE file=?"
error_msg = f"error adding file: {image.filename}"
else:
query = "INSERT INTO list(hash,date,size,file) VALUES (?,?,?,?)"
error_msg = f"error changing file: {image.filename}"
try:
self.db.cursor().execute(
query,
(
image.get_hash(),
image.get_time(),
image.get_size(),
image.filename,
),
)
except:
print(error_msg)
traceback.print_exc(file=sys.stdout)
sys.exit(1)
def base_add(self):
    """Create ``data`` rows (orientation, width, height) for hashes lacking one.

    Each hash is handled once even when several files share it. Work is
    committed every 50 rows and once more at the end.
    """
    rows = (
        self.db.cursor()
        .execute(
            """
            SELECT list.hash, list.file
            FROM list
            LEFT JOIN data ON data.hash = list.hash
            WHERE data.hash IS NULL
            """
        )
        .fetchall()
    )
    if not rows:
        return
    handled = set()
    writer = self.db.cursor()
    for index, (file_hash, filename) in enumerate(tqdm(rows, desc="Base info", delay=1)):
        if file_hash in handled:
            continue
        handled.add(file_hash)
        if filename is None:
            continue
        image = ImageMeasure(filename)
        writer.execute(
            """INSERT INTO data(hash,portrait,width,height)
            VALUES(?,?,?,?)""",
            (
                file_hash,
                image.get_portrait(),
                image.get_width(),
                image.get_height(),
            ),
        )
        if index % 50 == 0:
            self.db.conn.commit()
    self.db.conn.commit()
def delete_missing(self):
    """Drop ``list`` rows whose file no longer exists on disk."""
    known = self.db.cursor().execute("SELECT file FROM list").fetchall()
    gone = [
        row[0]
        for row in tqdm(known, delay=1, desc="Clean files")
        if not os.path.exists(row[0])
    ]
    remover = self.db.cursor()
    for name in tqdm(gone, desc="Cleaning", delay=1):
        remover.execute("DELETE FROM list where file == ?", (name,))
    self.db.conn.commit()
    if gone:
        print(f"Cleaned {len(gone)} files")
def clean_data(self):
to_delete = []
for row in self.db.cursor().execute(
"""SELECT data.hash FROM data LEFT JOIN list ON list.hash = data.hash WHERE list.hash IS NULL"""
):
to_delete.append(row[0])
if len(to_delete) > 0:
cursor = self.db.cursor()
for row in to_delete:
cursor.execute("DELETE FROM data where hash = ?", (row,))
self.db.conn.commit()
print(f"Cleaned {len(to_delete)} metadata")
to_delete = []
for row in self.db.cursor().execute(
"""SELECT tags.hash FROM tags LEFT JOIN list ON list.hash = tags.hash WHERE list.hash IS NULL"""
):
to_delete.append(row[0])
if len(to_delete) > 0:
cursor = self.db.cursor()
for row in to_delete:
cursor.execute("DELETE FROM tags where hash = ?", (row,))
self.db.conn.commit()
print(f"Cleaned {len(to_delete)} tags")
def measure(self):
    """Fill in fingerprint, sharpness and colour values missing from ``data``.

    Values already stored are loaded into the ImageMeasure first, so the
    getters only compute what is still ``None``. Each hash is measured once;
    work is committed every 50 rows and once more at the end.
    """
    pending = (
        self.db.cursor()
        .execute(
            """
            SELECT
            list.file,
            data.hash,
            data.fingerprint,
            data.sharpness,
            data.R,
            data.G,
            data.B,
            data.BR,
            data.BG,
            data.BB
            FROM data
            LEFT JOIN list ON data.hash = list.hash
            WHERE data.fingerprint IS NULL
            OR data.sharpness IS NULL
            OR data.R IS NULL
            """
        )
        .fetchall()
    )
    if not pending:
        return
    color_keys = ("R", "G", "B", "BR", "BG", "BB")
    measured = set()
    writer = self.db.cursor()
    for index, row in enumerate(tqdm(pending, desc="Measure", delay=1, smoothing=0.01)):
        filename, file_hash = row[0], row[1]
        if filename is None:
            continue
        if file_hash in measured:
            continue
        measured.add(file_hash)
        image = ImageMeasure(filename)
        image.hash = file_hash
        image.fingerprint = row[2]
        image.sharpness = row[3]
        for key, stored in zip(color_keys, row[4:10]):
            image.colors[key] = stored
        # Calculate if required
        image.fingerprint = image.get_fingerprint()
        image.sharpness = image.get_sharpness()
        image.colors.update(image.get_colors())
        writer.execute(
            """UPDATE data SET
            fingerprint = ?,
            sharpness = ?,
            R = ?,
            G = ?,
            B = ?,
            BR = ?,
            BG = ?,
            BB = ?
            WHERE hash = ?
            """,
            (
                image.fingerprint,
                image.sharpness,
                *(image.colors[key] for key in color_keys),
                image.hash,
            ),
        )
        if index % 50 == 0:
            self.db.conn.commit()
    self.db.conn.commit()
def disk_used(self):
    """Print summed file sizes per sub-path below ``options.path``.

    Paths are cut to ``options.diskused_depth`` components (unlimited when
    it is ``None``); a cut path gets a trailing ``/``. Each output line
    holds the byte count, the humanised size and the path, in the order
    the paths are first seen.

    Files are selected by exact prefix comparison rather than ``LIKE``, so
    sibling directories sharing the name prefix are excluded and ``_``/``%``
    in the path are not treated as wildcards.
    """
    if self.options.diskused_depth is None:
        self.options.diskused_depth = 9999999999
    depth = int(self.options.diskused_depth)
    # rstrip keeps the prefix a single "/" when the search path is the root.
    prefix = os.path.realpath(self.options.path).rstrip("/") + "/"
    result = self.db.cursor().execute(
        "SELECT size, SUBSTR(file, ?) AS path FROM list WHERE SUBSTR(file, 1, ?) = ?",
        (
            len(prefix) + 1,
            len(prefix),
            prefix,
        ),
    )
    totals = {}
    for size, rel_path in result:
        entry = "/".join(rel_path.split("/")[0:depth])
        if len(entry) != len(rel_path):
            entry += "/"
        # A NULL size counts as zero instead of aborting the summary.
        totals[entry] = totals.get(entry, 0) + (size or 0)
    for entry, size in totals.items():
        print("| ".join([str(size).ljust(14), humanize_size(size).rjust(8), entry]))
def duplicates(self):
result = self.db.cursor().execute(
"""
WITH
duplicates AS (SELECT hash FROM list GROUP BY hash HAVING count(hash) > 1),
f AS (SELECT
list.hash,list.file FROM list
LEFT JOIN duplicates ON (list.hash = duplicates.hash)
WHERE duplicates.hash IS NOT NULL
ORDER BY file
)
SELECT
CAST((row_number() OVER (PARTITION BY f.hash))-1 AS TEXT) AS row,
RELATIVE(file)
FROM f
""",
)
for row in result:
c = "=" if row[0] == "0" else ">"
print(c + "|".join(row))
def nearestcolor(self):
    """Find closest matching images to given RGB color

    ``options.nearestcolor`` is either ``R,G,B`` / ``R,G,B,hits`` or
    ``file`` / ``file,hits``; for a file its measured mean colour is used
    and the file itself is left out of the result. ``hits`` defaults to 1.
    Distance is the Euclidean distance to the stored border colour
    (BR, BG, BB). Rows without a measured border colour are skipped.

    Raises:
        ValueError: the numeric form does not have 3 or 4 values, or
            ``hits`` is not an integer.
    """
    spec = self.options.nearestcolor.strip()
    try:
        numbers = [int(i) for i in spec.strip('"').split(",")]
    except ValueError:
        numbers = None

    if numbers is not None:
        if len(numbers) == 3:
            numbers.append(1)
        if len(numbers) != 4:
            raise ValueError(f"color must be R,G,B or R,G,B,hits: {spec}")
        src = tuple(numbers)
        f = ""
    else:
        parts = spec.split(",")
        limit = 1 if len(parts) == 1 else int(parts[1])
        image = ImageMeasure(parts[0])
        colors = image.get_colors()
        f = os.path.realpath(parts[0])
        src = (colors["R"], colors["G"], colors["B"], limit)

    self.db.conn.create_function("SQUARE", 1, sqlite_square)
    self.db.conn.create_function("SQRT", 1, sqlite_sqrt)
    # Unmeasured rows hold NULL colours, which the Python-backed SQL
    # functions cannot process; they are filtered out up front.
    result = self.db.cursor().execute(
        """
        WITH distances AS (
        SELECT hash, ROUND(SQRT(SQUARE(BR-?)+SQUARE(BG-?)+SQUARE(BB-?)),1) as distance,BR,BG,BB FROM data
        WHERE BR IS NOT NULL AND BG IS NOT NULL AND BB IS NOT NULL
        ORDER BY distance LIMIT ?
        )
        SELECT
        RELATIVE(list.file),
        distances.distance,
        distances.BR,
        distances.BG,
        distances.BB
        FROM list
        LEFT JOIN
        distances ON (distances.hash = list.hash)
        WHERE distances.hash IS NOT NULL AND list.file != ?
        ORDER BY distances.distance
        """,
        (src[0], src[1], src[2], src[3], f),
    )
    print("|".join(("Path", "Dist", "BR", "BG", "BB")))
    for p, d, r, g, b in result:
        print(
            "|".join(
                (
                    p,
                    str(d),
                    str(int(r)),
                    str(int(g)),
                    str(int(b)),
                )
            )
        )
def similarity(self):
    """Print similar images (``--similar``) and visual duplicates (``--visdup``).

    ``options.similarity`` is a threshold (compare every measured image
    with every other), or ``file`` / ``file,threshold`` (compare one file
    with the database; threshold defaults to 20).
    ``options.visual_duplicate`` lists images sharing a fingerprint.

    Output is a header plus one line per image: the reference image of a
    group is marked ``=0``, its matches ``>1``, ``>2``, ...
    """

    def set_image(row):
        """Build an ImageMeasure from (hash, fingerprint, sharpness, width, height, R, G, B)."""
        image = ImageMeasure(None)
        image.hash = row[0]
        image.fingerprint = row[1]
        image.sharpness = row[2]
        image.width = row[3]
        image.height = row[4]
        image.colors["R"] = row[5]
        image.colors["G"] = row[6]
        image.colors["B"] = row[7]
        return image

    def get_matching(cmp_image, thr):
        """Return measured images within ``thr`` fingerprint distance of ``cmp_image``."""
        compare_list = self.db.cursor().execute(
            """SELECT hash,fingerprint,sharpness,width,height,R,G,B
            FROM data
            WHERE fingerprint IS NOT NULL AND sharpness > 0 AND hash != ?""",
            (cmp_image.hash,),
        )
        match_list = []
        for row2 in compare_list:
            other_image = set_image(row2)
            similarity = cmp_image.similarity_difference(other_image)
            if similarity <= thr:
                other_image.similarity["distance"] = similarity
                other_image.similarity["color"] = cmp_image.color_difference(other_image)
                other_image.similarity["aspect"] = cmp_image.shape_difference(other_image)
                other_image.filename = self.db.hash2file(other_image.hash)
                match_list.append(other_image)
        return match_list

    def get_duplicates():
        """Return rows of images sharing a fingerprint, numbered per fingerprint."""
        return self.db.cursor().execute(
            """
            WITH
            duplicates AS (SELECT fingerprint FROM data GROUP BY fingerprint HAVING count(fingerprint) > 1 AND sharpness > 0),
            duphash AS (
            SELECT duplicates.fingerprint, data.hash, data.sharpness, data.width, data.height, data.R, data.G, data.B
            FROM duplicates
            LEFT JOIN data ON (duplicates.fingerprint = data.fingerprint)
            ),
            f AS (SELECT
            duphash.fingerprint, duphash.hash,list.file,
            duphash.sharpness,
            duphash.width, duphash.height,
            duphash.R, duphash.G, duphash.B
            FROM duphash
            LEFT JOIN list ON (list.hash = duphash.hash)
            WHERE list.file IS NOT NULL
            ORDER BY list.file
            )
            SELECT
            CAST((row_number() OVER (PARTITION BY f.fingerprint))-1 AS TEXT) AS row,
            file,
            hash,
            fingerprint,
            sharpness,width,height,R,G,B
            FROM f
            """,
        )

    def print_matching(match_list, cmp_image):
        """Print the reference image followed by its matches, nearest first."""
        if len(match_list) > 0:
            match_list.sort(key=lambda i: i.similarity["distance"])
            print_similarity_row(cmp_image, "=", 0)
            for i, img in enumerate(match_list):
                print_similarity_row(img, ">", i + 1)

    def print_similarity_row(img, c, index):
        """Print one output line; ``img.filename`` is a list of paths."""
        fnames = ", ".join([self.db.file2relative(f) for f in img.filename])
        print(
            f"{c}{index}|{fnames}|{img.similarity['distance']}|{img.similarity['color']}|{img.similarity['aspect']}|{img.sharpness}|{img.width}|{img.height}"
        )

    print("|".join(("#", "File", "SD", "CD", "RD", "Shp", "W", "H")))

    if self.options.similarity:
        try:
            thr = int(self.options.similarity)
            file = None
        except ValueError:
            file = self.options.similarity.split(",")
            if len(file) == 1:
                thr = 20
            else:
                thr = int(file[1])
            file = file[0]

        if file is None:
            # Measure similarity on all files
            fingerprint_list = self.db.cursor().execute(
                """
                SELECT hash,fingerprint,sharpness,width,height,R,G,B
                FROM data
                WHERE fingerprint IS NOT NULL
                AND sharpness > 0"""
            )
            # Images already reported as a match do not start their own group.
            checked = set()
            for row in fingerprint_list:
                if row[0] in checked:
                    continue
                cmp_image = set_image(row)
                cmp_image.filename = self.db.hash2file(cmp_image.hash)
                match_list = get_matching(cmp_image, thr)
                for m in match_list:
                    checked.add(m.hash)
                print_matching(match_list, cmp_image)
        else:
            # Read a single image and find what is similar to it
            cmp_image = ImageMeasure(file)
            cmp_image.set_all()
            cmp_image.filename = [cmp_image.filename]
            match_list = get_matching(cmp_image, thr)
            print_matching(match_list, cmp_image)

    if self.options.visual_duplicate:
        # Rows numbered "0" start a new group; the previous group is flushed
        # when the next one starts and once more after the loop.
        # cmp_image starts as None so an empty result does not hit an
        # unbound name in the final flush.
        cmp_image = None
        match_list = []
        for row in get_duplicates():
            if row[0] == "0":
                if len(match_list) > 0:
                    print_matching(match_list, cmp_image)
                cmp_image = set_image(row[2:])
                cmp_image.filename = [row[1]]
                match_list = []
            else:
                other_image = set_image(row[2:])
                other_image.filename = [row[1]]
                other_image.similarity["color"] = cmp_image.color_difference(other_image)
                other_image.similarity["aspect"] = cmp_image.shape_difference(other_image)
                match_list.append(other_image)
        print_matching(match_list, cmp_image)
def tag_manage(self):
fname = os.path.realpath(self.options.file)
hash = self.db.file2hash(fname)
if hash is None:
raise Exception(f"{fname} not in database")
for add in self.options.add_tag:
self.db.cursor().execute(
"INSERT INTO tags(hash,tag) VALUES (?,?)",
(hash, add),
)
for rm in self.options.delete_tag:
self.db.cursor().execute(
"DELETE FROM tags WHERE hash = ? AND tag = ?",
(hash, rm),
)
if len(self.options.add_tag) + len(self.options.delete_tag) > 0:
self.db.conn.commit()
tags = self.db.cursor().execute(
"""
SELECT
tags.tag
FROM tags
WHERE tags.hash = ?
""",
(hash,),
)
print(",".join([x[0] for x in tags]))
def clean_dirs(dirs):
    """Drop excluded and hidden directory names from ``dirs`` in place.

    The list object itself must be modified because os.walk reads it back
    to decide where to descend.
    """
    dirs[:] = [name for name in dirs if name not in BADDIRS and not name.startswith(".")]
def clean_syms(files):
    """Return the entries of ``files`` that are not symbolic links, in order."""
    kept = []
    for candidate in files:
        if os.path.islink(candidate):
            continue
        kept.append(candidate)
    return kept
def humanize_size(size, precision=1):
    """Format a byte count with a binary (1024-based) unit suffix.

    Args:
        size: number of bytes, or ``None``.
        precision: decimals shown once the value has been scaled; plain
            byte counts are always shown without decimals.

    Returns:
        ``"nan"`` for ``None``, otherwise e.g. ``"512B"``, ``"1.0KB"``.
        Values beyond the largest suffix stay expressed in ``EB``.
    """
    if size is None:
        return "nan"
    suffixes = ["B", "KB", "MB", "GB", "TB", "PB", "EB"]
    suffix_index = 0
    shown_precision = 0
    # ">=" so that exactly 1024 becomes 1.0KB; the index bound keeps huge
    # values from running off the end of the suffix list.
    while size >= 1024 and suffix_index < len(suffixes) - 1:
        suffix_index += 1
        size = size / 1024.0
        shown_precision = precision
    return "%.*f%s" % (shown_precision, size, suffixes[suffix_index])
def humanize_date(date):
    """Format a Unix timestamp as local ``YYYY-MM-DD HH:MM:SS``; ``None`` gives ``""``."""
    if date is None:
        return ""
    moment = datetime.fromtimestamp(int(date))
    return moment.strftime("%Y-%m-%d %H:%M:%S")
def setup_options():
    """Parse the command line and return the options namespace.

    Defines the ``help``, ``db``, ``search``, ``du`` and ``tag`` commands.
    ``help`` (or no command at all) prints usage and exits with status 0.
    For ``db`` the ``-x`` exclusions are appended to ``BADDIRS``.
    """
    parser = ArgumentParser(description="Maintains a list of images sqlite file")
    parser.add_argument(
        "-f",
        action="store",
        dest="sqlfile",
        default=SQLFILE,
        help="SQL file name to use [%(default)s]",
    )
    commands = parser.add_subparsers(title="Command", dest="command")
    commands.add_parser("help", help="Help on all commands")
    db_cmd = commands.add_parser("db", help="Update database")
    search_cmd = commands.add_parser("search", help="Search similarity")
    du_cmd = commands.add_parser("du", help="Disk usage")
    tag_cmd = commands.add_parser("tag", help="Tag manager")

    # --- db ---
    db_switches = (
        ("--no-add", "-a", "no_add", "Do not add new files [%(default)s]"),
        (
            "--measure",
            "-m",
            "measure",
            "Measure various statistics for similarity/color searches. [%(default)s]",
        ),
        (
            "--changed",
            "-c",
            "changed",
            "Search for changed files and update their entries [%(default)s]",
        ),
        ("--no-delete", "-d", "no_delete", "Do not delete non-existing entries [%(default)s]"),
        ("--no-delete-data", "-D", "no_delete_data", "Do not delete unused metadata [%(default)s]"),
    )
    for long_flag, short_flag, dest, text in db_switches:
        db_cmd.add_argument(
            long_flag,
            short_flag,
            action="store_true",
            dest=dest,
            default=False,
            help=text,
        )
    db_cmd.add_argument(
        "-x",
        action="append",
        dest="exclude",
        default=[],
        help="Exclude folder name. This option may be issued several times.",
    )
    db_cmd.add_argument(
        "-l",
        action="store_true",
        dest="symlinks",
        default=False,
        help="Follow symbolic links [%(default)s]",
    )
    db_cmd.add_argument(
        "startpath",
        action="store",
        default=".",
        nargs="?",
        help="Path to start scanning for images.",
    )

    # --- du ---
    du_cmd.add_argument(
        "-d",
        type=int,
        action="store",
        dest="diskused_depth",
        default=None,
        help="Depth of summarization for du.",
    )
    du_cmd.add_argument(
        type=str,
        action="store",
        dest="path",
        default=".",
        help="Print directory sizes. Argument is the path where directories are listed from.",
        nargs="?",
    )

    # --- search ---
    search_cmd.add_argument(
        "--dup",
        action="store_true",
        dest="duplicate",
        default=False,
        help="Return a list of duplicate files, based on file hashes. [%(default)s]",
    )
    search_cmd.add_argument(
        "--visdup",
        action="store_true",
        dest="visual_duplicate",
        default=False,
        help="Return a list of visually exact duplicate files, based on perceptual hashes. [%(default)s]",
    )
    search_cmd.add_argument(
        "--color",
        type=str,
        dest="nearestcolor",
        default=False,
        help="Search list for nearest ambient color. format: R,G,B in uint8. Add fourth value to limit search to number of hits. Also accepts format file,hits to find nearest color to given file.",
    )
    search_cmd.add_argument(
        "--similar",
        type=str,
        dest="similarity",
        default=None,
        help=(
            "Search list for similar images. Value 0-255 for similarity threshold. 0=high similarity. "
            "If value is a filename, search similar to that image. "
            "Append with ',value' to limit similarity. default to 20."
            "The output columns: SD SimilarityDiff., CD ColorDiff., "
            "RD AspectRatioDiff.,Shp SharpnessIndex. This function does not return exact duplicates."
        ),
    )

    # --- tag ---
    tag_cmd.add_argument(
        "-t",
        action="append",
        dest="add_tag",
        default=[],
        help="Give file a tag.",
    )
    tag_cmd.add_argument(
        "-d",
        action="append",
        dest="delete_tag",
        default=[],
        help="Delete a tag.",
    )
    tag_cmd.add_argument(
        type=str,
        dest="file",
        default=None,
        help="File name for tagging.",
    )

    options = parser.parse_args()
    if options.command == "help":
        parser.print_help()
        for name, sub in (
            ("db", db_cmd),
            ("search", search_cmd),
            ("du", du_cmd),
            ("tag", tag_cmd),
        ):
            print(f"\n====\nCommand: {name}")
            sub.print_help()
        sys.exit(0)
    if options.command is None:
        parser.print_help()
        sys.exit(0)
    if options.command == "db":
        BADDIRS.extend(options.exclude)
    return options
def main():
    """Command line entry point: parse options and run the selected command.

    * ``db``: remove vanished files, scan for new/changed files, drop
      orphaned metadata and optionally measure images.
    * ``du``: print the disk usage summary.
    * ``search``: run the requested searches; the similarity report runs
      once even when both ``--visdup`` and ``--similar`` are given, since
      it handles both options itself.
    * ``tag``: manage the tags of one file.
    """
    options = setup_options()
    il = ImageList(options)
    if options.command == "db":
        if not options.no_delete:
            il.delete_missing()
        # The scan is also needed for --changed alone; recursive_add()
        # itself honours --no-add when it meets unknown files.
        if not options.no_add or options.changed:
            il.recursive_add()
            il.base_add()
        if not options.no_delete_data:
            il.clean_data()
        if options.measure:
            il.base_add()
            il.measure()
    if options.command == "du":
        il.disk_used()
    if options.command == "search":
        if options.duplicate:
            il.duplicates()
        if options.visual_duplicate or options.similarity:
            il.similarity()
        if options.nearestcolor:
            il.nearestcolor()
    if options.command == "tag":
        il.tag_manage()
    print("")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,129 @@
import os
import sqlite3
from math import sqrt as sqlite_sqrt
class DB:
def __init__(self, sqlfile):
self.sqlfile = sqlfile
self.root_path = os.path.dirname(os.path.realpath(sqlfile))
self.create_db()
self.connect()
def create_db(self):
if os.path.exists(self.sqlfile):
return
conn = sqlite3.connect(self.sqlfile)
db = conn.cursor()
conn.text_factory = str
db.execute(
"""CREATE TABLE data (
hash TEXT PRIMARY KEY,
description TEXT,
portrait BOOLEAN,
width INTEGER,
height INTEGER,
fingerprint TEXT,
sharpness NUMERIC,
R REAL, G REAL, B REAL, BR REAL, BG REAL, BB REAL
)"""
)
db.execute("CREATE TABLE list (file TEXT PRIMARY KEY,hash TEXT,date INTEGER,size INTEGER)")
db.execute("CREATE TABLE tags (hash TEXT,tag TEXT)")
db.execute(
"""CREATE VIEW files AS
SELECT list.file, list.date, list.size, data.*
FROM list
LEFT JOIN data ON data.hash = list.hash"""
)
db.execute("CREATE UNIQUE INDEX data_hash ON data(hash)")
db.execute("CREATE UNIQUE INDEX list_file ON list(file)")
conn.commit()
return
def connect(self):
conn = sqlite3.connect(self.sqlfile)
conn.text_factory = str
conn.create_function("RELATIVE", 1, self.file2relative)
self.conn = conn
return conn
def cursor(self):
return self.conn.cursor()
def get_folder_contents(self, path):
"""return the contents of the folder"""
files = []
res = self.cursor().execute("SELECT file FROM list where file LIKE ?", (f"{path}%",))
for row in res:
base = row[0].replace(path, "", 1)
if not "/" in base:
files.append(row[0])
return files
def is_time_mismatch(self, image):
count = (
self.cursor()
.execute(
"SELECT COUNT(1) FROM list WHERE file = ? AND date = ?",
(
image.filename,
image.get_time(),
),
)
.fetchall()[0][0]
)
return count == 0
def is_hash_mismatch(self, image):
count = (
self.cursor()
.execute(
"SELECT COUNT(1) FROM list WHERE file = ? AND hash = ?",
(
image.filename,
image.get_hash(),
),
)
.fetchall()[0][0]
)
return count == 0
def hash2file(self, hash):
return [
row[0]
for row in self.cursor()
.execute(
"SELECT file FROM LIST WHERE hash = ?",
(hash,),
)
.fetchall()
]
def file2hash(self, file):
try:
return [
row[0]
for row in self.cursor()
.execute(
"SELECT hash FROM LIST WHERE file = ?",
(file,),
)
.fetchall()
][0]
except Exception:
return None
def file2relative(self, file):
return os.path.relpath(file, self.root_path)
def sqlite_square(x):
    """Return ``x`` squared; meant to be registered as an SQL function.

    SQL NULL arrives as ``None`` and is passed through as NULL, the way
    built-in SQL functions treat it, instead of raising ``TypeError``.
    """
    if x is None:
        return None
    return x * x

View File

@@ -0,0 +1,229 @@
import hashlib
import os
import sys
import imagehash
import numpy as np
from PIL import Image, ImageFilter
# TurboJPEG is an optional accelerator: when the import or the decoder
# construction fails, JPEG stays None and read_image() uses PIL instead.
try:
from turbojpeg import TJPF_RGB, TurboJPEG
JPEG = TurboJPEG()
except Exception:
JPEG = None
pass
# 3x3 second-difference kernels along x and along y; get_sharpness() sums
# the absolute responses of both filters.
LaplaceX = ImageFilter.Kernel(size=(3, 3), kernel=(0, 0, 0, 1, -2, 1, 0, 0, 0), scale=1, offset=0)
LaplaceY = ImageFilter.Kernel(size=(3, 3), kernel=(0, 1, 0, 0, -2, 0, 0, 1, 0), scale=1, offset=0)
# Boolean mask for the 10x10 thumbnail used by get_colors(): first row plus
# first and last column. The last row is left out (its line is commented
# out below). NOTE(review): the reason is not visible here — confirm it is
# intentional.
Border = np.zeros((10, 10), dtype=bool)
Border[0, :] = True
# Border[9, :] = True
Border[:, 0] = True
Border[:, 9] = True
class ImageMeasure:
def __init__(self, filename):
self.filename = filename
self.hash = None
self.time = None
self.size = None
self.description = None
self.width = None
self.height = None
self.portrait = None
self.fingerprint = None
self.sharpness = None
self.colors = {x: None for x in ("R", "G", "B", "BR", "BG", "BB")}
self.similarity = {"distance": 0, "color": 0, "aspect": 0}
self.tags = []
self.image = None
def __str__(self):
printable = []
for k, v in self.__dict__.items():
if k == "image":
if not self.image is None:
v = "Loaded..."
printable.append(f"{k}: {v}")
return "\n".join(printable)
def set_all(self):
self.set_filename_absolute()
self.get_hash()
self.get_time()
self.get_size()
self.get_shape()
self.get_fingerprint()
self.get_sharpness()
self.get_colors()
def set_filename_absolute(self):
self.filename = os.path.realpath(self.filename)
def get_hash(self):
"""Return hash of the file"""
if self.hash is None:
hasher = hashlib.sha1()
blk = 2**16
with open(self.filename, "rb") as f:
while True:
d = f.read(blk)
if not d:
break
hasher.update(d)
self.hash = hasher.hexdigest()
return self.hash
def get_time(self):
"""Return mtime of the file"""
if self.time is None:
self.time = int(os.path.getmtime(self.filename))
return self.time
def get_size(self):
if self.size is None:
self.size = os.path.getsize(self.filename)
return self.size
def get_width(self):
if self.width is None:
self.width, _, _ = self.get_shape()
return self.width
def get_height(self):
if self.height is None:
_, self.height, _ = self.get_shape()
return self.height
def get_portrait(self):
if self.portrait is None:
_, _, self.portrait = self.get_shape()
return self.portrait
def get_shape(self):
if self.width is None or self.height is None or self.portrait is None:
# self.height, self.width = self.get_image("numpy").shape[0:2]
self.width, self.height = read_image_size(self.filename)
self.portrait = self.height >= self.width
return self.width, self.height, self.portrait
def get_image(self, image_type="numpy"):
    """Return the image as a numpy array or a PIL image, loading it on first use.

    Args:
        image_type: ``"numpy"`` or ``"PIL"``.

    Returns:
        The cached image when it already has the requested type, otherwise
        a conversion of it; ``None`` for any other ``image_type``.
    """
    if self.image is None:
        self.image, self.image_type = read_image(self.filename)
        if self.image_type == "numpy" and len(self.image.shape) > 2:
            # BGR -> RGB
            self.image = np.flip(self.image, axis=2)
    if self.image_type == image_type:
        return self.image
    if image_type == "numpy":
        return np.array(self.image)
    elif image_type == "PIL":
        return Image.fromarray(self.image)
    return None
def get_fingerprint(self):
if self.fingerprint is None:
# self.fingerprint = str(imagehash.phash(self.get_image("PIL"), hash_size=8))
self.fingerprint = str(imagehash.dhash(self.get_image("PIL"), hash_size=8))
return self.fingerprint
def get_sharpness(self):
if self.sharpness is None:
try:
im = self.get_image("PIL").convert("L")
crop_box = (1, 1, im.width - 1, im.height - 1)
self.sharpness = round(
(
np.sum(np.abs(np.array(im.filter(LaplaceX).crop(crop_box)).astype(float)))
+ np.sum(np.abs(np.array(im.filter(LaplaceY).crop(crop_box)).astype(float)))
)
/ (2 * im.width * im.height),
4,
)
except Exception:
self.sharpness = 0
return self.sharpness
def get_colors(self):
def get_border(im):
return int(np.mean(im[Border]))
if self.colors["R"] is None:
im = self.get_image("PIL").convert("RGB")
th = im.copy()
th.thumbnail((1, 1), resample=Image.BILINEAR)
th = np.array(th)
im = np.array(im.resize((10, 10), resample=Image.BILINEAR))
self.colors["R"] = int(th[0][0][0])
self.colors["G"] = int(th[0][0][1])
self.colors["B"] = int(th[0][0][2])
self.colors["BR"] = get_border(im[:, :, 0])
self.colors["BG"] = get_border(im[:, :, 1])
self.colors["BB"] = get_border(im[:, :, 2])
return self.colors
def similarity_difference(self, other):
    """Return the difference between the two images' fingerprints (``other - self``)."""
    theirs = imagehash.hex_to_hash(other.get_fingerprint())
    ours = imagehash.hex_to_hash(self.get_fingerprint())
    distance = theirs - ours
    return distance
def color_difference(self, other):
other_color = other.get_colors()
this_color = self.get_colors()
diff = round(
np.sqrt(
np.square(other_color["R"] - this_color["R"])
+ np.square(other_color["G"] - this_color["G"])
+ np.square(other_color["B"] - this_color["B"])
),
1,
)
return diff
def shape_difference(self, other):
return round(abs(float(other.width) / float(other.height) - float(self.width) / float(self.height)), 4)
# Lower-case file extensions treated as images, and the JPEG subset.
EXTENSIONS = (".jpg", ".png", ".tif", ".gif", ".jpeg", ".tiff")
JPEG_EXTENSIONS = (".jpg", ".jpeg")


def is_image_extension(f):
    """Return True when the extension of ``f`` (case-insensitive) is an image extension."""
    _, extension = os.path.splitext(f.lower())
    return extension in EXTENSIONS


def is_jpeg(f):
    """Return True when the extension of ``f`` (case-insensitive) is a JPEG extension."""
    _, extension = os.path.splitext(f.lower())
    return extension in JPEG_EXTENSIONS
def read_image(fname):
    """Load an image and return ``(image, "PIL")``.

    JPEG files are decoded with TurboJPEG when it is available; if that
    fails for any reason, or for any other file type, PIL opens the file.
    """
    if is_jpeg(fname) and JPEG:
        try:
            with open(fname, "rb") as fp:
                decoded = JPEG.decode(fp.read(), pixel_format=TJPF_RGB)
                return Image.fromarray(decoded), "PIL"
        except Exception:
            # Fall through to the PIL loader below.
            pass
    return Image.open(fname), "PIL"
def read_image_size(fname):
    """Just reading the size is faster with PIL

    Returns:
        ``(width, height)`` of the image.

    The file is opened through a context manager so the handle is released
    immediately; this function runs once per scanned file.
    """
    with Image.open(fname) as im:
        return im.width, im.height

View File

@@ -0,0 +1,26 @@
import os
from distutils.core import setup
def version_reader(path):
for line in open(path, "rt").read(1024).split("\n"):
if line.startswith("__version__"):
return line.split("=")[1].strip().replace('"', "")
# The version is read textually from the package source instead of importing
# the package.
version = version_reader(os.path.join("imagelist2", "__init__.py"))
# NOTE(review): setup() is imported from distutils.core in this file, while
# entry_points and install_requires are setuptools keywords, and distutils is
# no longer part of the standard library from Python 3.12 on — confirm that
# setuptools is what actually executes this (e.g. through pip).
setup(
name="imagelist2",
packages=["imagelist2"],
version=version,
description="Maintains a list of images sqlite file",
author="Ville R",
author_email="q@six9.net",
keywords=["images"],
entry_points={
"console_scripts": [
"image-list = imagelist2:main",
]
},
install_requires=["PyTurboJPEG", "Pillow", "ImageHash", "numpy", "tqdm"],
)