Skip to content
Snippets Groups Projects
Commit 5cf7ab48 authored by Rob Swindell's avatar Rob Swindell :speech_balloon:
Browse files

Support find/remove of duplicates based on archive contents

Duplicate archives (e.g. ZIP files) often have different digest/hash/CRC
values because they have a different ZIP comment, or because the files were
archived in a different order or with different compression schemes. You can
now detect (and delete) these duplicates.
parent ee41c8b6
No related branches found
No related tags found
1 merge request!463MRC mods by Codefenix (2024-10-20)
......@@ -27,11 +27,15 @@ for(var i = 0; i < argc; i++) {
writeln(" -min=<bytes> specify minimum file size to compare hash/sum");
writeln(" -ex=<filename> add to excluded file name list (case-insensitive)");
writeln(" -names search for duplicate file names (case-insensitive)");
writeln(" -arc search for duplicate contents of archive files");
writeln(" -hash hash each archived file contents for comparison");
writeln(" -sha1 search for duplicate SHA-1 sums (the default)");
writeln(" -crc32 search for duplicate CRC-32 sums");
writeln(" -md5 search for duplicate MD5 sums");
writeln(" -json[=space] create JSON formatted output");
writeln(" -v increase verbosity of JSON output");
writeln(" -sort sort each directory file list");
writeln(" -reverse reverse each directory file list");
writeln(" -dedupe remove/delete duplicate files");
exit(0);
}
......@@ -80,12 +84,13 @@ for(var i = 0; i < argc; i++) {
if(dir_list.length < 1)
for(var dir in file_area.dir)
dir_list.push(dir);
if(!options.names && !hash_type)
if(!options.names && !options.arc && !hash_type)
hash_type = "sha1";
log("Reading file areas...");
var name = {};
var hash = {};
var arc = {};
var total_files = 0;
var total_bytes = 0;
for(var i in dir_list) {
......@@ -94,7 +99,9 @@ for(var i in dir_list) {
var base = new FileBase(dir_code);
if(!base.open())
throw new Error(base.last_error);
var list = base.get_list(detail, /* sort: */false);
var list = base.get_list(detail, options.sort);
if(options.reverse)
list.reverse();
for(var j = 0; j < list.length; j++) {
var file = list[j];
if(exclude.indexOf(file.name.toUpperCase()) >= 0)
......@@ -112,6 +119,26 @@ for(var i in dir_list) {
hash[file[hash_type]] = [];
hash[file[hash_type]].push(file);
}
// Group this file with other archives that contain the same set of files.
// The archive's content listing (sorted by name, with container-specific
// properties removed) is serialized and used as the grouping key in `arc`.
if(options.arc) {
	var contents = undefined;
	try {
		contents = new Archive(file.path).list(options.hash);
		// Archived-file order must not influence the key, so sort by name.
		// The comparator returns a number in every case (-1, 1 or 0).
		contents.sort(function(a, b) {
			if(a.name < b.name)
				return -1;
			if(a.name > b.name)
				return 1;
			return 0;
		});
		for(var a = 0; a < contents.length; a++) {
			// These properties are dropped so they cannot make otherwise
			// identical listings produce different keys.
			delete contents[a].format;
			delete contents[a].compression;
			delete contents[a].mode;
			// With -hash the timestamp is dropped from the key as well
			// (presumably because the content hash then identifies each
			// archived file -- confirm against Archive.list()).
			if(options.hash)
				delete contents[a].time;
		}
	} catch(e) {
		// Deliberately ignored (best effort): presumably thrown for files
		// that are not readable archives -- confirm. If no listing was
		// obtained, `contents` stays undefined and the file is not grouped.
	}
	if(contents) {
		var key = JSON.stringify(contents);
		if(!arc[key])
			arc[key] = [];
		arc[key].push(file);
	}
}
total_bytes += file.size;
}
base.close();
......@@ -120,9 +147,10 @@ for(var i in dir_list) {
log("Searching for duplicates in " + total_files + " files ("
+ file_size_str(total_bytes, /* unit */1, /* precision */1) + " bytes) ...");
var dupe = { name: [], hash: []};
var dupe = { name: [], hash: [], arc: []};
var name_bytes = 0;
var hash_bytes = 0;
var arc_bytes = 0;
for(var n in name) {
var f = name[n];
if(f.length <= 1)
......@@ -141,6 +169,15 @@ for(var n in hash) {
hash_bytes += f[i].size;
}
// Collect every group of files sharing the same archive-contents key and
// total the size of all but the first file in each group.
for(var n in arc) {
	var f = arc[n];
	if(f.length > 1) {
		dupe.arc.push(f);
		// Index 0 is skipped: only the additional copies are counted.
		for(var i = 1; i < f.length; i++) {
			arc_bytes += f[i].size;
		}
	}
}
if(options.names) {
log(dupe.name.length + " duplicate file names (" + file_size_str(name_bytes,1 , 1) + " bytes)");
if(options.dedupe)
......@@ -160,6 +197,15 @@ if(hash_type) {
else
print_list(dupe.hash, hash_type);
}
// Report the archives found to have duplicate contents, then either remove
// them (-dedupe), dump them as JSON (-json) or print them as a plain list.
if(options.arc) {
	log(dupe.arc.length + " duplicate archives (" + file_size_str(arc_bytes,1 , 1) + " bytes)");
	if(options.dedupe) {
		writeln(remove_list(dupe.arc, "contents") + " files removed");
	} else if(options.json) {
		writeln(JSON.stringify(dupe.arc, null, json_space));
	} else {
		print_list(dupe.arc, "contents");
	}
}
function print_list(list, key)
{
......@@ -167,6 +213,8 @@ function print_list(list, key)
var value = list[i][0][key];
if(key == 'crc32')
value = format("%08X", value);
else if(!value)
value = list[i][0].name;
writeln("Duplicate file " + key + " #" + (i + 1) + ": " + value);
for(var j = 0; j < list[i].length; j++) {
var file = list[i][j];
......@@ -182,6 +230,8 @@ function remove_list(list, key)
var value = list[i][0][key];
if(key == 'crc32')
value = format("%08X", value);
else if(!value)
value = list[i][0].name;
writeln("Duplicates of file " + key + " #" + (i + 1) + ": " + value);
writeln(" Keeping " + list[i][0].path);
for(var j = 1; j < list[i].length; j++) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment