From 302a628a761ca8d7a38b192df33519b317103732 Mon Sep 17 00:00:00 2001
From: Rob Swindell <rob@synchro.net>
Date: Sat, 24 Apr 2021 19:31:03 -0700
Subject: [PATCH] Support find/remove of duplicates based on archive contents

Duplicate archives (e.g. ZIP files) often have different digest/hash/CRC
values because they have different ZIP comments, the files were archived
in a different order, or different compression schemes were used. You can
now detect (and delete) these duplicates.
---
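Notes for reviewers (kept below the "---" separator, so git am ignores
them): a minimal sketch of the fingerprinting technique the diff applies.
The helper name arc_key() and the sample paths are hypothetical; the
Archive.list() call and the deleted fields are the ones used in the patch.

	// Build a comparison key for an archive: list its contents, sort the
	// entries by name (so member order is irrelevant), and strip fields
	// that legitimately differ between otherwise-identical archives.
	function arc_key(path, hash)
	{
		var contents = new Archive(path).list(hash);
		contents.sort(function(a, b) { if(a.name < b.name) return -1; return a.name > b.name; });
		for(var i = 0; i < contents.length; i++) {
			delete contents[i].format;      // container format
			delete contents[i].compression; // e.g. deflate vs. store
			delete contents[i].mode;        // permission bits
			if(hash)                        // with per-file hashes available,
				delete contents[i].time;    // timestamps can be ignored too
		}
		return JSON.stringify(contents);
	}

	// Hypothetical usage: two ZIPs whose members match byte-for-byte yield
	// equal keys even if archive comments, member order, or compression
	// levels differ:
	//   arc_key("/tmp/a.zip", true) == arc_key("/tmp/b.zip", true)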
 exec/dupefind.js | 56 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/exec/dupefind.js b/exec/dupefind.js
index 051b908e26..453de7a74c 100755
--- a/exec/dupefind.js
+++ b/exec/dupefind.js
@@ -27,11 +27,15 @@ for(var i = 0; i < argc; i++) {
 		writeln("  -min=<bytes>   specify minimum file size to compare hash/sum");
 		writeln("  -ex=<filename> add to excluded file name list (case-insensitive)");
 		writeln("  -names         search for duplicate file names (case-insensitive)");
+		writeln("  -arc           search for duplicate contents of archive files");
+		writeln("  -hash          hash each archived file contents for comparison");
 		writeln("  -sha1          search for duplicate SHA-1 sums (the default)");
 		writeln("  -crc32         search for duplicate CRC-32 sums");
 		writeln("  -md5           search for duplicate MD5 sums");
 		writeln("  -json[=space]  create JSON formatted output");
 		writeln("  -v             increase verbosity of JSON output");
+		writeln("  -sort          sort each directory file list");
+		writeln("  -reverse       reverse each directory file list");
 		writeln("  -dedupe        remove/delete duplicate files");
 		exit(0);
 	}
@@ -80,12 +84,13 @@ for(var i = 0; i < argc; i++) {
 if(dir_list.length < 1)
 	for(var dir in file_area.dir)
 		dir_list.push(dir);
-if(!options.names && !hash_type)
+if(!options.names && !options.arc && !hash_type)
 	hash_type = "sha1";
 
 log("Reading file areas...");
 var name = {};
 var hash = {};
+var arc = {};
 var total_files = 0;
 var total_bytes = 0;
 for(var i in dir_list) {
@@ -94,7 +99,9 @@ for(var i in dir_list) {
 	var base = new FileBase(dir_code);
 	if(!base.open())
 		throw new Error(base.last_error);
-	var list = base.get_list(detail, /* sort: */false);
+	var list = base.get_list(detail, options.sort);
+	if(options.reverse)
+		list.reverse();
 	for(var j = 0; j < list.length; j++) {
 		var file = list[j];
 		if(exclude.indexOf(file.name.toUpperCase()) >= 0)
@@ -112,6 +119,26 @@ for(var i in dir_list) {
 				hash[file[hash_type]] = [];
 			hash[file[hash_type]].push(file);
 		}
+		if(options.arc) {
+			var contents = undefined;
+			try {
+				var contents = new Archive(file.path).list(options.hash);
+				contents.sort(function(a,b) { if(a.name < b.name) return -1; return a.name > b.name; } );
+				for(var a = 0; a < contents.length; a++) {
+					delete contents[a].format;
+					delete contents[a].compression;
+					delete contents[a].mode;
+					if(options.hash)
+						delete contents[a].time;
+				}
+			} catch(e) { }
+			if(contents) {
+				var key = JSON.stringify(contents);
+				if(!arc[key])
+					arc[key] = [];
+				arc[key].push(file);
+			}
+		}
 		total_bytes += file.size;
 	}
 	base.close();
@@ -120,9 +147,10 @@ for(var i in dir_list) {
 
 log("Searching for duplicates in " + total_files + " files ("
 	+ file_size_str(total_bytes, /* unit */1, /* precision */1) + " bytes) ...");
-var dupe = { name: [], hash: []};
+var dupe = { name: [], hash: [], arc: []};
 var name_bytes = 0;
 var hash_bytes = 0;
+var arc_bytes = 0;
 for(var n in name) {
 	var f = name[n];
 	if(f.length <= 1)
@@ -141,6 +169,15 @@ for(var n in hash) {
 		hash_bytes += f[i].size;
 }
 
+for(var n in arc) {
+	var f = arc[n];
+	if(f.length <= 1)
+		continue;
+	dupe.arc.push(f);
+	for(var i = 1; i < f.length; i++)
+		arc_bytes += f[i].size;
+}
+
 if(options.names) {
 	log(dupe.name.length + " duplicate file names (" + file_size_str(name_bytes,1 , 1) + " bytes)");
 	if(options.dedupe)
@@ -160,6 +197,15 @@ if(hash_type) {
 	else
 		print_list(dupe.hash, hash_type);
 }
+if(options.arc) {
+	log(dupe.arc.length + " duplicate archives (" + file_size_str(arc_bytes,1 , 1) + " bytes)");
+	if(options.dedupe)
+		writeln(remove_list(dupe.arc, "contents") + " files removed");
+	else if(options.json)
+		writeln(JSON.stringify(dupe.arc, null, json_space));
+	else
+		print_list(dupe.arc, "contents");
+}
 
 function print_list(list, key)
 {
@@ -167,6 +213,8 @@ function print_list(list, key)
 		var value = list[i][0][key];
 		if(key == 'crc32')
 			value = format("%08X", value);
+		else if(!value)
+			value = list[i][0].name;
 		writeln("Duplicate file " + key + " #" + (i + 1) + ": " + value);
 		for(var j = 0; j < list[i].length; j++) {
 			var file = list[i][j];
@@ -182,6 +230,8 @@ function remove_list(list, key)
 		var value = list[i][0][key];
 		if(key == 'crc32')
 			value = format("%08X", value);
+		else if(!value)
+			value = list[i][0].name;
 		writeln("Duplicates of file " + key + " #" + (i + 1) + ": " + value);
 		writeln("  Keeping " + list[i][0].path);
 		for(var j = 1; j < list[i].length; j++) {
-- 
GitLab