From d7c525ab729e51b1d305e6d08279640bf58c1b10 Mon Sep 17 00:00:00 2001 From: rswindell <> Date: Wed, 21 Nov 2007 01:00:10 +0000 Subject: [PATCH] Strip Ctrl-A codes before calculating hash of body text - for improved dupe detection. --- src/smblib/smbdefs.h | 26 ++++++++++++++------------ src/smblib/smbhash.c | 30 ++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/smblib/smbdefs.h b/src/smblib/smbdefs.h index 9edf519ccc..4151c18c4e 100644 --- a/src/smblib/smbdefs.h +++ b/src/smblib/smbdefs.h @@ -8,7 +8,7 @@ * @format.tab-size 4 (Plain Text/Source Code File Header) * * @format.use-tabs true (see http://www.synchro.net/ptsc_hdr.html) * * * - * Copyright 2006 Rob Swindell - http://www.synchro.net/copyright.html * + * Copyright 2007 Rob Swindell - http://www.synchro.net/copyright.html * * * * This program is free software; you can redistribute it and/or * * modify it under the terms of the GNU General Public License * @@ -433,17 +433,19 @@ typedef struct _PACK { /* Index record */ } idxrec_t; - /* valid bits in hash_t.flags */ -#define SMB_HASH_CRC16 (1<<0) /* CRC-16 hash is valid */ -#define SMB_HASH_CRC32 (1<<1) /* CRC-32 hash is valid */ -#define SMB_HASH_MD5 (1<<2) /* MD5 digest is valid */ -#define SMB_HASH_MASK (SMB_HASH_CRC16|SMB_HASH_CRC32|SMB_HASH_MD5) - -#define SMB_HASH_MARKED (1<<4) /* Used by smb_findhash() */ - -#define SMB_HASH_STRIP_WSP (1<<6) /* Strip white-space chars first */ -#define SMB_HASH_LOWERCASE (1<<7) /* Convert A-Z to a-z first */ -#define SMB_HASH_PROC_MASK (SMB_HASH_STRIP_WSP|SMB_HASH_LOWERCASE) + /* valid bits in hash_t.flags */ +#define SMB_HASH_CRC16 (1<<0) /* CRC-16 hash is valid */ +#define SMB_HASH_CRC32 (1<<1) /* CRC-32 hash is valid */ +#define SMB_HASH_MD5 (1<<2) /* MD5 digest is valid */ +#define SMB_HASH_MASK (SMB_HASH_CRC16|SMB_HASH_CRC32|SMB_HASH_MD5) + +#define SMB_HASH_MARKED (1<<4) /* Used by smb_findhash() */ + +#define SMB_HASH_STRIP_CTRL_A (1<<5) /* Strip Ctrl-A codes first */ +#define SMB_HASH_STRIP_WSP (1<<6) /* Strip white-space chars first */ +#define SMB_HASH_LOWERCASE (1<<7) /* Convert A-Z to a-z first */ +#define SMB_HASH_PROC_MASK (SMB_HASH_STRIP_CTRL_A|SMB_HASH_STRIP_WSP|SMB_HASH_LOWERCASE) +#define SMB_HASH_PROC_COMP_MASK (SMB_HASH_STRIP_WSP|SMB_HASH_LOWERCASE) enum { SMB_HASH_SOURCE_BODY diff --git a/src/smblib/smbhash.c b/src/smblib/smbhash.c index b0367a44de..60da7548df 100644 --- a/src/smblib/smbhash.c +++ b/src/smblib/smbhash.c @@ -8,7 +8,7 @@ * @format.tab-size 4 (Plain Text/Source Code File Header) * * @format.use-tabs true (see http://www.synchro.net/ptsc_hdr.html) * * * - * Copyright 2005 Rob Swindell - http://www.synchro.net/copyright.html * + * Copyright 2007 Rob Swindell - http://www.synchro.net/copyright.html * * * * This library is free software; you can redistribute it and/or * * modify it under the terms of the GNU Lesser General Public License * @@ -81,7 +81,7 @@ int SMBCALL smb_findhash(smb_t* smb, hash_t** compare, hash_t* found_hash, continue; /* wrong source length */ if(compare[c]->flags&SMB_HASH_MARKED) continue; /* already marked */ - if((compare[c]->flags&SMB_HASH_PROC_MASK)!=(hash.flags&SMB_HASH_PROC_MASK)) + if((compare[c]->flags&SMB_HASH_PROC_COMP_MASK)!=(hash.flags&SMB_HASH_PROC_COMP_MASK)) continue; /* wrong pre-process flags */ if((compare[c]->flags&hash.flags&SMB_HASH_MASK)==0) continue; /* no matching hashes */ @@ -93,7 +93,7 @@ int SMBCALL smb_findhash(smb_t* smb, hash_t** compare, hash_t* found_hash, continue; /* wrong crc-32 */ if(compare[c]->flags&hash.flags&SMB_HASH_MD5 && memcmp(compare[c]->md5,hash.md5,sizeof(hash.md5))) - continue; /* wrong crc-16 */ + continue; /* wrong MD5 */ /* successful match! */ break; /* can't match more than one, so stop comparing */ @@ -166,6 +166,22 @@ static char* strip_chars(uchar* dst, const uchar* src, uchar* set) return((char *)dst); } +static char* strip_ctrla(uchar* dst, const uchar* src) +{ + while(*src) { + if(*src==CTRL_A) { + src++; + if(*src) + src++; + } + else + *(dst++)=*(src++); + } + *dst=0; + + return((char *)dst); +} + /* Allocates and calculates hashes of data (based on flags) */ /* Returns NULL on failure */ hash_t* SMBCALL smb_hash(ulong msgnum, ulong t, unsigned source, unsigned flags @@ -204,8 +220,10 @@ hash_t* SMBCALL smb_hashstr(ulong msgnum, ulong t, unsigned source, unsigned fla if(flags&SMB_HASH_PROC_MASK) { /* string pre-processing */ if((p=strdup(str))==NULL) return(NULL); + if(flags&SMB_HASH_STRIP_CTRL_A) + strip_ctrla(p,p); if(flags&SMB_HASH_STRIP_WSP) - strip_chars(p,str," \t\r\n"); + strip_chars(p,p," \t\r\n"); if(flags&SMB_HASH_LOWERCASE) strlwr(p); } @@ -218,7 +236,7 @@ hash_t* SMBCALL smb_hashstr(ulong msgnum, ulong t, unsigned source, unsigned fla return(hash); } -/* Allocatese and calculates all hashes for a single message */ +/* Allocates and calculates all hashes for a single message */ /* Returns NULL on failure */ hash_t** SMBCALL smb_msghashes(smbmsg_t* msg, const uchar* body) { @@ -241,7 +259,7 @@ hash_t** SMBCALL smb_msghashes(smbmsg_t* msg, const uchar* body) (hash=smb_hashstr(msg->hdr.number, t, SMB_HASH_SOURCE_FTN_ID, flags, msg->ftn_msgid))!=NULL) hashes[h++]=hash; - flags|=SMB_HASH_STRIP_WSP; + flags|=SMB_HASH_STRIP_WSP|SMB_HASH_STRIP_CTRL_A; if(body!=NULL && (hash=smb_hashstr(msg->hdr.number, t, SMB_HASH_SOURCE_BODY, flags, body))!=NULL) hashes[h++]=hash; -- GitLab