Skip to content

Commit

Permalink
Renamed --shown-db to --reported-db, added "diff_tier" concept
Browse files Browse the repository at this point in the history
  • Loading branch information
mk-fg committed Jul 10, 2012
1 parent 8257831 commit 13a0fdb
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 34 deletions.
37 changes: 25 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ Just run as `./image_matcher.py --feh ~/media/images`.

% ./image_matcher.py -h

usage: image_matcher.py [-h] [--hash-db PATH] [-p THREADS] [-n COUNT] [--feh]
[--feh-args CMDLINE] [--debug]
usage: image_matcher.py [-h] [--hash-db PATH] [-d [PATH]] [-p THREADS]
[-n COUNT] [--feh] [--feh-args CMDLINE] [--debug]
paths [paths ...]

positional arguments:
Expand All @@ -48,6 +48,10 @@ Just run as `./image_matcher.py --feh ~/media/images`.
-h, --help show this help message and exit
--hash-db PATH Path to db to store hashes in (default:
./image_matcher.db).
-d [PATH], --reported-db [PATH]
Record already-displayed pairs in a specified file and
dont show these again. Can be specified without
parameter to use "reported.db" file in the current dir.
-p THREADS, --parallel THREADS
How many hashing ops can be done in parallel (default:
try cpu_count() or 1).
Expand All @@ -58,22 +62,25 @@ Just run as `./image_matcher.py --feh ~/media/images`.
defined (see --feh-args).
--feh-args CMDLINE Feh commandline parameters (space-separated, unless
quoted with ") before two image paths (default: -GNFY
--info "echo '%f %wx%h (diff: {diff})'" --action8 "rm
%f" --action1 "kill -INT {pid}", only used with --feh,
python-format keywords available: path1, path2, pid,
diff)
--info "echo '%f %wx%h (diff: {diff}, {diff_n} /
{diff_count})'" --action8 "rm %f" --action1 "kill -INT
{pid}", only used with --feh, python-format keywords
available: path1, path2, n, pid, diff, diff_n,
diff_count)
--debug Verbose operation mode.

feh can be customized to do any action or show any kind of info alongside images
with --feh-args parameter. It's also possible to make it show images
side-by-side in montage mode or in separate windows in multiwindow mode, see
"man feh" for details.

Default line (`feh -GNFY --info "echo '%f %wx%h (diff: {diff})'" --action8 "rm
%f" --action1 "kill -INT {pid}" {path1} {path2}`) makes it show fullscreen
image, some basic info (along with difference between image hashes) about it and
action reference, pressing "8" there will remove currently displayed version,
"1" will stop the comparison and quitting feh ("q") will go to the next pair.
Default line (`feh -GNFY --info "echo '%f %wx%h (diff: {diff}, {diff_n} /
{diff_count})'" --action8 "rm %f" --action1 "kill -INT {pid}" {path1} {path2}`)
makes it show a fullscreen image, some basic info about it (along with the
difference between image hashes and how many image pairs there are with the
same level of difference) and an action reference. Pressing "8" there will
remove the currently displayed version, "1" will stop the comparison, and
quitting feh ("q") will go to the next pair.

Without --feh (non-interactive / non-gui mode), tool outputs pairs of images and
the [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) value for
Expand All @@ -83,6 +90,11 @@ two).
Output is sorted by this "distance" value, so most similar images (with the
lowest number) should come first (see --top-n parameter).

The optional --reported-db (or "-d") parameter allows efficient skipping of
already-reported "similar" image pairs by recording these in a dbm file.
The intended usage for this option is to skip repeating the same hash-similar
pairs on repeated runs, reporting similarity for new images instead.


Operation
--------------------
Expand All @@ -101,7 +113,8 @@ Script does these steps, in order:
combinations, sorting the results ("sort_by_similarity" function).

* Print (or run "feh" on) each found image-pair ("print(path1, path2, d)" line),
in most-similar-first order.
in most-similar-first order, optionally skipping pairs matching those in
--reported-db file.

It's fairly simple, really: all the magic and awesomeness is in the calculation
of those "perceptual hash" values, and is abstracted by
Expand Down
60 changes: 38 additions & 22 deletions image_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,11 @@ def main():
parser.add_argument('--hash-db', metavar='PATH',
default='{}.db'.format(os.path.splitext(sys.argv[0])[0]),
help='Path to db to store hashes in (default: %(default)s).')
parser.add_argument('-d', '--shown-db',
parser.add_argument('-d', '--reported-db',
nargs='?', metavar='PATH', default=False,
help='Record already-displayed pairs in'
' a specified file and dont show these again.'
' Can be specified without parameter to use "shown.db" file in the current dir.')
' Can be specified without parameter to use "reported.db" file in the current dir.')
parser.add_argument('-p', '--parallel', type=int, metavar='THREADS',
help='How many hashing ops'
' can be done in parallel (default: try cpu_count() or 1).')
Expand All @@ -106,11 +106,13 @@ def main():
help='Run feh for each image match with'
' removal actions defined (see --feh-args).')
parser.add_argument('--feh-args', metavar='CMDLINE',
default=( '-GNFY --info "echo \'%f %wx%h (diff: {diff})\'"'
default=( '-GNFY --info "echo \'%f %wx%h'
' (diff: {diff}, {diff_n} / {diff_count})\'"'
' --action8 "rm %f" --action1 "kill -INT {pid}"' ),
help='Feh commandline parameters (space-separated,'
' unless quoted with ") before two image paths (default: %(default)s,'
' only used with --feh, python-format keywords available: path1, path2, pid, diff)')
' only used with --feh, python-format keywords available:'
' path1, path2, n, pid, diff, diff_n, diff_count)')
parser.add_argument('--debug',
action='store_true', help='Verbose operation mode.')
optz = parser.parse_args()
Expand Down Expand Up @@ -151,29 +153,43 @@ def quote_split(arg_line):
threads=optz.parallel if optz.parallel > 1 else False )
finally: pickle.dump(dcts, open(optz.hash_db, 'wb'))

if optz.shown_db is not False:
if optz.reported_db is not False:
import shelve
optz.shown_db = shelve.open(optz.shown_db or 'shown.db', 'c')
else: optz.shown_db = None
optz.reported_db = shelve.open(optz.reported_db or 'reported.db', 'c')
log.debug('Cleaning up reported_db of non-existent paths')
for paths_key in optz.reported_db.keys():
path1, path2 = paths_key.split('\0')
if not all(it.imap(os.path.exists, [path1, path2])):
del optz.reported_db[paths_key]
else: optz.reported_db = None

if optz.top_n != 0:
for i, (d, path1, path2) in enumerate(sort_by_similarity(dcts)):
if optz.shown_db is not None:
paths_key = '{}\0{}'.format(*sorted([path1, path2]))
if paths_key in optz.shown_db: continue
print(path1, path2, d)
if optz.feh and all(it.imap(os.path.exists, [path1, path2])):
cmd = ['feh'] + list(arg.format( path1=path1, path2=path2,
pid=os.getpid(), diff=d ) for arg in optz.feh_args) + [path1, path2]
log.debug('Feh command: {}'.format(cmd))
Popen(cmd).wait()
if optz.shown_db is not None\
and all(it.imap(os.path.exists, [path1, path2])):
optz.shown_db[paths_key] = True
if optz.top_n is not None and i >= optz.top_n: break
n, pid = 0, os.getpid()
for d, diff_tier in it.groupby(sort_by_similarity(dcts), key=op.itemgetter(0)):
diff_tier = list(diff_tier)
diff_count = len(diff_tier)
for diff_n, (d, path1, path2) in enumerate(diff_tier):
n += 1
if optz.reported_db is not None:
paths_key = '{}\0{}'.format(*sorted([path1, path2]))
if paths_key in optz.reported_db:
log.debug('Skipped path-pair due to reported_db: {} {}'.format(path1, path2))
continue
print(path1, path2, d)
if optz.feh and all(it.imap(os.path.exists, [path1, path2])):
cmd = ['feh'] + list(
arg.format( path1=path1, path2=path2, pid=pid,
diff=d, n=n+1, diff_n=diff_n+1, diff_count=diff_count )
for arg in optz.feh_args ) + [path1, path2]
log.debug('Feh command: {}'.format(cmd))
Popen(cmd).wait()
if optz.reported_db is not None\
and all(it.imap(os.path.exists, [path1, path2])):
optz.reported_db[paths_key] = True
if optz.top_n is not None and n >= optz.top_n: break
except KeyboardInterrupt: sys.exit(0)
finally:
if optz.shown_db is not None: optz.shown_db.sync()
if optz.reported_db is not None: optz.reported_db.sync()


if __name__ == '__main__': main()

0 comments on commit 13a0fdb

Please sign in to comment.