Skip to content
Snippets Groups Projects
Commit 302a628a authored by Rob Swindell's avatar Rob Swindell :speech_balloon:
Browse files

Support find/remove of duplicates based on archive contents

Duplicate archives (e.g. ZIP files) often have different digest/hash/CRC
values because they have a different ZIP comment or the files were archived
in a different order or used different compression schemes. You can now detect
(and delete) these duplicates.
parent 1e1e648c
No related branches found
No related tags found
No related merge requests found
......@@ -27,11 +27,15 @@ for(var i = 0; i < argc; i++) {
writeln(" -min=<bytes> specify minimum file size to compare hash/sum");
writeln(" -ex=<filename> add to excluded file name list (case-insensitive)");
writeln(" -names search for duplicate file names (case-insensitive)");
writeln(" -arc search for duplicate contents of archive files");
writeln(" -hash hash each archived file contents for comparison");
writeln(" -sha1 search for duplicate SHA-1 sums (the default)");
writeln(" -crc32 search for duplicate CRC-32 sums");
writeln(" -md5 search for duplicate MD5 sums");
writeln(" -json[=space] create JSON formatted output");
writeln(" -v increase verbosity of JSON output");
writeln(" -sort sort each directory file list");
writeln(" -reverse reverse each directory file list");
writeln(" -dedupe remove/delete duplicate files");
exit(0);
}
......@@ -80,12 +84,13 @@ for(var i = 0; i < argc; i++) {
if(dir_list.length < 1)
for(var dir in file_area.dir)
dir_list.push(dir);
if(!options.names && !hash_type)
if(!options.names && !options.arc && !hash_type)
hash_type = "sha1";
log("Reading file areas...");
var name = {};
var hash = {};
var arc = {};
var total_files = 0;
var total_bytes = 0;
for(var i in dir_list) {
......@@ -94,7 +99,9 @@ for(var i in dir_list) {
var base = new FileBase(dir_code);
if(!base.open())
throw new Error(base.last_error);
var list = base.get_list(detail, /* sort: */false);
var list = base.get_list(detail, options.sort);
if(options.reverse)
list.reverse();
for(var j = 0; j < list.length; j++) {
var file = list[j];
if(exclude.indexOf(file.name.toUpperCase()) >= 0)
......@@ -112,6 +119,26 @@ for(var i in dir_list) {
hash[file[hash_type]] = [];
hash[file[hash_type]].push(file);
}
if(options.arc) {
	// Compare archives by a normalized listing of their contents:
	// metadata that legitimately differs between otherwise-identical
	// archives (container format, compression scheme, stored file mode,
	// and - when hashing contents - member timestamps) is stripped
	// before the listing is serialized into a comparison key.
	var contents;	// remains undefined when file is not a readable archive
	try {
		contents = new Archive(file.path).list(options.hash);
		// Sort members by name so the order files were added to the
		// archive does not affect the comparison key.
		contents.sort(function(a, b) { if(a.name < b.name) return -1; return a.name > b.name; });
		for(var a = 0; a < contents.length; a++) {
			delete contents[a].format;
			delete contents[a].compression;
			delete contents[a].mode;
			if(options.hash)
				delete contents[a].time;
		}
	} catch(e) {
		// Best-effort: not every file in the area is an archive (or a
		// supported/readable one); skip it rather than abort the scan.
	}
	if(contents) {
		// Bucket files by serialized listing; buckets with more than one
		// entry are reported as duplicates later.
		var key = JSON.stringify(contents);
		if(!arc[key])
			arc[key] = [];
		arc[key].push(file);
	}
}
total_bytes += file.size;
}
base.close();
......@@ -120,9 +147,10 @@ for(var i in dir_list) {
log("Searching for duplicates in " + total_files + " files ("
+ file_size_str(total_bytes, /* unit */1, /* precision */1) + " bytes) ...");
var dupe = { name: [], hash: []};
var dupe = { name: [], hash: [], arc: []};
var name_bytes = 0;
var hash_bytes = 0;
var arc_bytes = 0;
for(var n in name) {
var f = name[n];
if(f.length <= 1)
......@@ -141,6 +169,15 @@ for(var n in hash) {
hash_bytes += f[i].size;
}
// Collect archive-content duplicate groups: any listing key shared by
// two or more files is a duplicate set.
for(var key in arc) {
	var group = arc[key];
	if(group.length > 1) {
		dupe.arc.push(group);
		// The first file in each group is considered the "original";
		// only the remaining copies count toward wasted space.
		for(var d = 1; d < group.length; d++)
			arc_bytes += group[d].size;
	}
}
if(options.names) {
log(dupe.name.length + " duplicate file names (" + file_size_str(name_bytes,1 , 1) + " bytes)");
if(options.dedupe)
......@@ -160,6 +197,15 @@ if(hash_type) {
else
print_list(dupe.hash, hash_type);
}
// Report archives found to contain duplicate contents; depending on
// options either delete the extra copies, emit JSON, or print the list.
if(options.arc) {
	log(dupe.arc.length + " duplicate archives (" + file_size_str(arc_bytes,1 , 1) + " bytes)");
	if(options.dedupe)
		// Remove all but the first file of each duplicate group
		writeln(remove_list(dupe.arc, "contents") + " files removed");
	else if(options.json)
		writeln(JSON.stringify(dupe.arc, null, json_space));
	else
		print_list(dupe.arc, "contents");
}
function print_list(list, key)
{
......@@ -167,6 +213,8 @@ function print_list(list, key)
var value = list[i][0][key];
if(key == 'crc32')
value = format("%08X", value);
else if(!value)
value = list[i][0].name;
writeln("Duplicate file " + key + " #" + (i + 1) + ": " + value);
for(var j = 0; j < list[i].length; j++) {
var file = list[i][j];
......@@ -182,6 +230,8 @@ function remove_list(list, key)
var value = list[i][0][key];
if(key == 'crc32')
value = format("%08X", value);
else if(!value)
value = list[i][0].name;
writeln("Duplicates of file " + key + " #" + (i + 1) + ": " + value);
writeln(" Keeping " + list[i][0].path);
for(var j = 1; j < list[i].length; j++) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment