Renamed --shown-db to --reported-db, added "diff_tier" concept

mk-fg · Jul 10, 2012 · 13a0fdb · 13a0fdb
1 parent 8257831
commit 13a0fdb
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 34 deletions.
diff --git a/README.md b/README.md
@@ -37,8 +37,8 @@ Just run as `./image_matcher.py --feh ~/media/images`.
 
 	% ./image_matcher.py -h
 
-	usage: image_matcher.py [-h] [--hash-db PATH] [-p THREADS] [-n COUNT] [--feh]
-	                        [--feh-args CMDLINE] [--debug]
+	usage: image_matcher.py [-h] [--hash-db PATH] [-d [PATH]] [-p THREADS]
+	                        [-n COUNT] [--feh] [--feh-args CMDLINE] [--debug]
 	                        paths [paths ...]
 
 	positional arguments:
@@ -48,6 +48,10 @@ Just run as `./image_matcher.py --feh ~/media/images`.
 	  -h, --help            show this help message and exit
 	  --hash-db PATH        Path to db to store hashes in (default:
 	                        ./image_matcher.db).
+	  -d [PATH], --reported-db [PATH]
+	                        Record already-displayed pairs in a specified file and
+	                        dont show these again. Can be specified without
+	                        parameter to use "reported.db" file in the current dir.
 	  -p THREADS, --parallel THREADS
 	                        How many hashing ops can be done in parallel (default:
 	                        try cpu_count() or 1).
@@ -58,22 +62,25 @@ Just run as `./image_matcher.py --feh ~/media/images`.
 	                        defined (see --feh-args).
 	  --feh-args CMDLINE    Feh commandline parameters (space-separated, unless
 	                        quoted with ") before two image paths (default: -GNFY
-	                        --info "echo '%f %wx%h (diff: {diff})'" --action8 "rm
-	                        %f" --action1 "kill -INT {pid}", only used with --feh,
-	                        python-format keywords available: path1, path2, pid,
-	                        diff)
+	                        --info "echo '%f %wx%h (diff: {diff}, {diff_n} /
+	                        {diff_count})'" --action8 "rm %f" --action1 "kill -INT
+	                        {pid}", only used with --feh, python-format keywords
+	                        available: path1, path2, n, pid, diff, diff_n,
+	                        diff_count)
 	  --debug               Verbose operation mode.
 
 feh can be customized to do any action or show any kind of info alongside images
 with --feh-args parameter. It's also possible to make it show images
 side-by-side in montage mode or in separate windows in multiwindow mode, see
 "man feh" for details.
 
-Default line (`feh -GNFY --info "echo '%f %wx%h (diff: {diff})'" --action8 "rm
-%f" --action1 "kill -INT {pid}" {path1} {path2}`) makes it show fullscreen
-image, some basic info (along with difference between image hashes) about it and
-action reference, pressing "8" there will remove currently displayed version,
-"1" will stop the comparison and quitting feh ("q") will go to the next pair.
+Default line (`feh -GNFY --info "echo '%f %wx%h (diff: {diff}, {diff_n} /
+{diff_count})'" --action8 "rm %f" --action1 "kill -INT {pid}" {path1} {path2}`)
+makes it show fullscreen image, some basic info (along with difference between
+image hashes and how many image pairs share the same level of difference)
+about it and action reference, pressing "8" there will remove currently
+displayed version, "1" will stop the comparison and quitting feh ("q") will go
+to the next pair.
 
 Without --feh (non-interactive / non-gui mode), tool outputs pairs of images and
 the [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) value for
@@ -83,6 +90,11 @@ two).
 Output is sorted by this "distance" value, so most similar images (with the
 lowest number) should come first (see --top-n parameter).
 
+The optional --reported-db (or "-d") parameter allows efficient skipping of
+already-reported "similar" image pairs by recording these in a dbm file.
+The intended use of this option is to avoid repeating the same hash-similar
+pairs on repeated runs, reporting similarity for new images instead.
+
 
 Operation
 --------------------
@@ -101,7 +113,8 @@ Script does these steps, in order:
   combinations, sorting the results ("sort_by_similarity" function).
 
 * Print (or run "feh" on) each found image-pair ("print(path1, path2, d)" line),
-  in most-similar-first order.
+  in most-similar-first order, optionally skipping pairs matching those in
+  --reported-db file.
 
 It's fairly simple, really, all the magic and awesomeness is in calculation of
 that "perceptual hash" values, and is abstracted by

diff --git a/image_matcher.py b/image_matcher.py
@@ -92,11 +92,11 @@ def main():
 	parser.add_argument('--hash-db', metavar='PATH',
 		default='{}.db'.format(os.path.splitext(sys.argv[0])[0]),
 		help='Path to db to store hashes in (default: %(default)s).')
-	parser.add_argument('-d', '--shown-db',
+	parser.add_argument('-d', '--reported-db',
 		nargs='?', metavar='PATH', default=False,
 		help='Record already-displayed pairs in'
 				' a specified file and dont show these again.'
-			' Can be specified without parameter to use "shown.db" file in the current dir.')
+			' Can be specified without parameter to use "reported.db" file in the current dir.')
 	parser.add_argument('-p', '--parallel', type=int, metavar='THREADS',
 		help='How many hashing ops'
 			' can be done in parallel (default: try cpu_count() or 1).')
@@ -106,11 +106,13 @@ def main():
 		help='Run feh for each image match with'
 			' removal actions defined (see --feh-args).')
 	parser.add_argument('--feh-args', metavar='CMDLINE',
-		default=( '-GNFY --info "echo \'%f %wx%h (diff: {diff})\'"'
+		default=( '-GNFY --info "echo \'%f %wx%h'
+			' (diff: {diff}, {diff_n} / {diff_count})\'"'
 			' --action8 "rm %f" --action1 "kill -INT {pid}"' ),
 		help='Feh commandline parameters (space-separated,'
 			' unless quoted with ") before two image paths (default: %(default)s,'
-			' only used with --feh, python-format keywords available: path1, path2, pid, diff)')
+			' only used with --feh, python-format keywords available:'
+			' path1, path2, n, pid, diff, diff_n, diff_count)')
 	parser.add_argument('--debug',
 		action='store_true', help='Verbose operation mode.')
 	optz = parser.parse_args()
@@ -151,29 +153,43 @@ def quote_split(arg_line):
 				threads=optz.parallel if optz.parallel > 1 else False )
 		finally: pickle.dump(dcts, open(optz.hash_db, 'wb'))
 
-		if optz.shown_db is not False:
+		if optz.reported_db is not False:
 			import shelve
-			optz.shown_db = shelve.open(optz.shown_db or 'shown.db', 'c')
-		else: optz.shown_db = None
+			optz.reported_db = shelve.open(optz.reported_db or 'reported.db', 'c')
+			log.debug('Cleaning up reported_db of non-existent paths')
+			for paths_key in optz.reported_db.keys():
+				path1, path2 = paths_key.split('\0')
+				if not all(it.imap(os.path.exists, [path1, path2])):
+					del optz.reported_db[paths_key]
+		else: optz.reported_db = None
 
 		if optz.top_n != 0:
-			for i, (d, path1, path2) in enumerate(sort_by_similarity(dcts)):
-				if optz.shown_db is not None:
-					paths_key = '{}\0{}'.format(*sorted([path1, path2]))
-					if paths_key in optz.shown_db: continue
-				print(path1, path2, d)
-				if optz.feh and all(it.imap(os.path.exists, [path1, path2])):
-					cmd = ['feh'] + list(arg.format( path1=path1, path2=path2,
-						pid=os.getpid(), diff=d ) for arg in optz.feh_args) + [path1, path2]
-					log.debug('Feh command: {}'.format(cmd))
-					Popen(cmd).wait()
-				if optz.shown_db is not None\
-						and all(it.imap(os.path.exists, [path1, path2])):
-					optz.shown_db[paths_key] = True
-				if optz.top_n is not None and i >= optz.top_n: break
+			n, pid = 0, os.getpid()
+			for d, diff_tier in it.groupby(sort_by_similarity(dcts), key=op.itemgetter(0)):
+				diff_tier = list(diff_tier)
+				diff_count = len(diff_tier)
+				for diff_n, (d, path1, path2) in enumerate(diff_tier):
+					n += 1
+					if optz.reported_db is not None:
+						paths_key = '{}\0{}'.format(*sorted([path1, path2]))
+						if paths_key in optz.reported_db:
+							log.debug('Skipped path-pair due to reported_db: {} {}'.format(path1, path2))
+							continue
+					print(path1, path2, d)
+					if optz.feh and all(it.imap(os.path.exists, [path1, path2])):
+						cmd = ['feh'] + list(
+							arg.format( path1=path1, path2=path2, pid=pid,
+								diff=d, n=n+1, diff_n=diff_n+1, diff_count=diff_count )
+							for arg in optz.feh_args ) + [path1, path2]
+						log.debug('Feh command: {}'.format(cmd))
+						Popen(cmd).wait()
+					if optz.reported_db is not None\
+							and all(it.imap(os.path.exists, [path1, path2])):
+						optz.reported_db[paths_key] = True
+					if optz.top_n is not None and n >= optz.top_n: break
 	except KeyboardInterrupt: sys.exit(0)
 	finally:
-		if optz.shown_db is not None: optz.shown_db.sync()
+		if optz.reported_db is not None: optz.reported_db.sync()
 
 
 if __name__ == '__main__': main()