From 302a628a761ca8d7a38b192df33519b317103732 Mon Sep 17 00:00:00 2001
From: Rob Swindell <rob@synchro.net>
Date: Sat, 24 Apr 2021 19:31:03 -0700
Subject: [PATCH] Support find/remove of duplicates based on archive contents

Duplicate archives (e.g. ZIP files) often have different digest/hash/CRC
values because they carry a different archive comment, the files were archived
in a different order, or different compression schemes were used. You can now
detect (and delete) these duplicates.
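
For illustration only (not part of the patch): the idea is to derive a
"content key" for each archive so that two archives with identical contents
compare equal even when their own CRC/hash values differ. This sketch assumes
the Archive.list() behavior used in the diff below; archive_content_key() and
the example paths are hypothetical names, not existing functions or files.

    function archive_content_key(path, hash_contents)
    {
    	var contents = new Archive(path).list(hash_contents);
    	// Order within the archive shouldn't matter, so sort entries by name
    	contents.sort(function(a, b) {
    		return a.name < b.name ? -1 : (a.name > b.name ? 1 : 0);
    	});
    	for(var i = 0; i < contents.length; i++) {
    		// Drop fields that can differ between otherwise-identical archives
    		delete contents[i].format;
    		delete contents[i].compression;
    		delete contents[i].mode;
    		if(hash_contents)
    			delete contents[i].time;
    	}
    	return JSON.stringify(contents);
    }

    // Two archives are considered duplicates when their keys match, e.g.:
    // archive_content_key("files/foo.zip", true) == archive_content_key("files/foo2.zip", true)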
---
 exec/dupefind.js | 56 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/exec/dupefind.js b/exec/dupefind.js
index 051b908e26..453de7a74c 100755
--- a/exec/dupefind.js
+++ b/exec/dupefind.js
@@ -27,11 +27,15 @@ for(var i = 0; i < argc; i++) {
 			writeln("  -min=<bytes>    specify minimum file size to compare hash/sum");
 			writeln("  -ex=<filename>  add to excluded file name list (case-insensitive)");
 			writeln("  -names          search for duplicate file names (case-insensitive)");
+			writeln("  -arc            search for duplicate contents of archive files");
+			writeln("  -hash           hash each archived file contents for comparison");
 			writeln("  -sha1           search for duplicate SHA-1 sums (the default)");
 			writeln("  -crc32          search for duplicate CRC-32 sums");
 			writeln("  -md5            search for duplicate MD5 sums");
 			writeln("  -json[=space]   create JSON formatted output");
 			writeln("  -v              increase verbosity of JSON output");
+			writeln("  -sort           sort each directory file list");
+			writeln("  -reverse        reverse each directory file list");
 			writeln("  -dedupe         remove/delete duplicate files");
 			exit(0);
 		}
@@ -80,12 +84,13 @@ for(var i = 0; i < argc; i++) {
 if(dir_list.length < 1)
 	for(var dir in file_area.dir)
 		dir_list.push(dir);
-if(!options.names && !hash_type)
+if(!options.names && !options.arc && !hash_type)
 	hash_type = "sha1";
 
 log("Reading file areas...");
 var name = {};
 var hash = {};
+var arc = {};
 var total_files = 0;
 var total_bytes = 0;
 for(var i in dir_list) {
@@ -94,7 +99,9 @@ for(var i in dir_list) {
 	var base = new FileBase(dir_code);
 	if(!base.open())
 		throw new Error(base.last_error);
-	var list = base.get_list(detail, /* sort: */false);
+	var list = base.get_list(detail, options.sort);
+	if(options.reverse)
+		list.reverse();
 	for(var j = 0; j < list.length; j++) {
 		var file = list[j];
 		if(exclude.indexOf(file.name.toUpperCase()) >= 0)
@@ -112,6 +119,26 @@ for(var i in dir_list) {
 				hash[file[hash_type]] = [];
 			hash[file[hash_type]].push(file);
 		}
+		if(options.arc) {
+			var contents = undefined;
+			try {
+				var contents = new Archive(file.path).list(options.hash);
+				contents.sort(function(a,b) { if(a.name < b.name) return -1; return a.name > b.name; } );
+				for(var a = 0; a < contents.length; a++) {
+					delete contents[a].format;
+					delete contents[a].compression;
+					delete contents[a].mode;
+					if(options.hash)
+						delete contents[a].time;
+				}
+			} catch(e) { }
+			if(contents) {
+				var key = JSON.stringify(contents);
+				if(!arc[key])
+					arc[key] = [];
+				arc[key].push(file);
+			}
+		}
 		total_bytes += file.size;
 	}
 	base.close();
@@ -120,9 +147,10 @@ for(var i in dir_list) {
 
 log("Searching for duplicates in " + total_files + " files (" 
 	+ file_size_str(total_bytes, /* unit */1, /* precision */1) + " bytes) ...");
-var dupe = { name: [], hash: []};
+var dupe = { name: [], hash: [], arc: []};
 var name_bytes = 0;
 var hash_bytes = 0;
+var arc_bytes = 0;
 for(var n in name) {
 	var f = name[n];
 	if(f.length <= 1)
@@ -141,6 +169,15 @@ for(var n in hash) {
 		hash_bytes += f[i].size;
 }
 
+for(var n in arc) {
+	var f = arc[n];
+	if(f.length <= 1)
+		continue;
+	dupe.arc.push(f);
+	for(var i = 1; i < f.length; i++)
+		arc_bytes += f[i].size;
+}
+
 if(options.names) {
 	log(dupe.name.length + " duplicate file names (" + file_size_str(name_bytes,1 , 1) + " bytes)");
 	if(options.dedupe)
@@ -160,6 +197,15 @@ if(hash_type) {
 	else
 		print_list(dupe.hash, hash_type);
 }
+if(options.arc) {
+	log(dupe.arc.length + " duplicate archives (" + file_size_str(arc_bytes,1 , 1) + " bytes)");
+	if(options.dedupe)
+		writeln(remove_list(dupe.arc, "contents") + " files removed");
+	else if(options.json)
+		writeln(JSON.stringify(dupe.arc, null, json_space));
+	else
+		print_list(dupe.arc, "contents");
+}
 
 function print_list(list, key)
 {
@@ -167,6 +213,8 @@ function print_list(list, key)
 		var value = list[i][0][key];
 		if(key == 'crc32')
 			value = format("%08X", value);
+		else if(!value)
+			value = list[i][0].name;
 		writeln("Duplicate file " + key + " #" + (i + 1) + ": " + value);
 		for(var j = 0; j < list[i].length; j++) {
 			var file = list[i][j];
@@ -182,6 +230,8 @@ function remove_list(list, key)
 		var value = list[i][0][key];
 		if(key == 'crc32')
 			value = format("%08X", value);
+		else if(!value)
+			value = list[i][0].name;
 		writeln("Duplicates of file " + key + " #" + (i + 1) + ": " + value);
 		writeln("  Keeping " + list[i][0].path);
 		for(var j = 1; j < list[i].length; j++) {
-- 
GitLab