Skip to content
Snippets Groups Projects
Commit adef4e98 authored by rswindell's avatar rswindell
Browse files

Hash types are now 0-31 (no longer re-using header field type values for RFC822

MSGID's and FTN MSGID's) so that a bit-mask may be passed to functions like
smb_findhash(), allowing selective duplicate searches without pre-marking
hashes (to be skipped) in a message's hash list. This limits the number of
possible hash sources to 32 total (down from 256) - not a real concern.

This change will render existing msg-id's in hash files obsolete.

The usage of the following functions was affected:
smb_addmsg() - from BOOL (dupechk) to bit-field (dupechk_hashes)
smb_msghashes() - removed dupechk kludge (to pre-mark text hash)
smb_findhash() - added source_mask argument to select which source types are checked
parent 4ceb3ea1
No related branches found
No related tags found
No related merge requests found
......@@ -42,7 +42,7 @@
/****************************************************************************/
/****************************************************************************/
int SMBCALL smb_addmsg(smb_t* smb, smbmsg_t* msg, int storage, BOOL dupechk
int SMBCALL smb_addmsg(smb_t* smb, smbmsg_t* msg, int storage, long dupechk_hashes
,ushort xlat, const uchar* body, const uchar* tail)
{
uchar* lzhbuf=NULL;
......@@ -82,9 +82,9 @@ int SMBCALL smb_addmsg(smb_t* smb, smbmsg_t* msg, int storage, BOOL dupechk
msg->hdr.number=smb->status.last_msg+1;
hashes=smb_msghashes(msg,body,dupechk);
hashes=smb_msghashes(msg,body);
if(smb_findhash(smb, hashes, &found, /* update? */FALSE)==SMB_SUCCESS) {
if(smb_findhash(smb, hashes, &found, dupechk_hashes, /* mark? */FALSE)==SMB_SUCCESS) {
safe_snprintf(smb->last_error,sizeof(smb->last_error)
,"duplicate %s: %s found in message #%lu"
,smb_hashsourcetype(found.source)
......@@ -104,7 +104,7 @@ int SMBCALL smb_addmsg(smb_t* smb, smbmsg_t* msg, int storage, BOOL dupechk
bodylen--;
/* Calculate CRC-32 of message text (before encoding, if any) */
if(smb->status.max_crcs && dupechk) {
if(smb->status.max_crcs && dupechk_hashes&SMB_HASH_SOURCE_BODY) {
for(l=0;l<bodylen;l++)
crc=ucrc32(body[l],crc);
crc=~crc;
......
......@@ -467,11 +467,25 @@ typedef struct _PACK { /* Index record */
#define SMB_HASH_LOWERCASE (1<<7) /* Convert A-Z to a-z first */
#define SMB_HASH_PROC_MASK (SMB_HASH_STRIP_WSP|SMB_HASH_LOWERCASE)
/* Hash source types: identify which part of a message a hash was computed from. */
/* These are ordinal values (bit positions), NOT bit flags: a source type must be */
/* converted with (1<<SMB_HASH_SOURCE_*) before being tested against or combined */
/* into a source bit-mask (e.g. the source_mask argument of smb_findhash()). */
/* Note that SMB_HASH_SOURCE_BODY is 0, so AND-ing a mask with it directly always */
/* yields 0. */
enum {
SMB_HASH_SOURCE_BODY /* 0: hash of the message body text */
,SMB_HASH_SOURCE_MSG_ID /* 1: hash of the message ID string (msg->id) */
,SMB_HASH_SOURCE_FTN_ID /* 2: hash of the FTN message ID string (msg->ftn_msgid) */
/* Add new ones here (max value of 31) */
,SMB_HASH_SOURCE_TYPES /* number of source types defined above (keep last) */
};
#define SMB_HASH_SOURCE_MASK 0x1f /* low 5 bits: range of valid source type values (0-31) */
#define SMB_HASH_SOURCE_NONE 0 /* source bit-mask with no bits set (selects no source types) */
#define SMB_HASH_SOURCE_ALL 0xff /* source bit-mask with bits 0-7 set (covers every type currently defined) */
typedef struct _PACK {
ulong number; /* Message number */
ulong time; /* Local time of fingerprinting */
uchar source; /* (e.g. TEXT_BODY, RFC822MSGID, FIDOMSGID) */
uchar source; /* SMB_HASH_SOURCE* (in low 5-bits) */
uchar flags; /* indications of valid hashes and pre-processing */
ushort crc16; /* CRC-16 of source */
ulong crc32; /* CRC-32 of source */
......
......@@ -43,7 +43,8 @@
#include "genwrap.h"
/* If return value is SMB_ERROR_NOT_FOUND, hash file is left open */
int SMBCALL smb_findhash(smb_t* smb, hash_t** compare, hash_t* found_hash, BOOL mark)
int SMBCALL smb_findhash(smb_t* smb, hash_t** compare, hash_t* found_hash,
long source_mask, BOOL mark)
{
int retval;
BOOL found=FALSE;
......@@ -69,6 +70,9 @@ int SMBCALL smb_findhash(smb_t* smb, hash_t** compare, hash_t* found_hash, BOOL
if(hash.flags==0)
continue; /* invalid hash record (!?) */
if((source_mask&(1<<hash.source))==0) /* not checking this source type */
continue;
for(c=0;compare[c]!=NULL;c++) {
if(compare[c]->source!=hash.source)
......@@ -215,7 +219,7 @@ hash_t* SMBCALL smb_hashstr(ulong msgnum, ulong t, unsigned source, unsigned fla
/* Allocates and calculates all hashes for a single message */
/* Returns NULL on failure */
hash_t** SMBCALL smb_msghashes(smbmsg_t* msg, const uchar* text, BOOL dupechk)
hash_t** SMBCALL smb_msghashes(smbmsg_t* msg, const uchar* body)
{
size_t h=0;
uchar flags=SMB_HASH_CRC16|SMB_HASH_CRC32|SMB_HASH_MD5;
......@@ -230,19 +234,17 @@ hash_t** SMBCALL smb_msghashes(smbmsg_t* msg, const uchar* text, BOOL dupechk)
memset(hashes, 0, sizeof(hash_t*)*SMB_MAX_HASH_COUNT);
if(msg->id!=NULL
&& (hash=smb_hashstr(msg->hdr.number, t, RFC822MSGID, flags, msg->id))!=NULL)
if(msg->id!=NULL &&
(hash=smb_hashstr(msg->hdr.number, t, SMB_HASH_SOURCE_MSG_ID, flags, msg->id))!=NULL)
hashes[h++]=hash;
if(msg->ftn_msgid!=NULL
&& (hash=smb_hashstr(msg->hdr.number, t, FIDOMSGID, flags, msg->ftn_msgid))!=NULL)
if(msg->ftn_msgid!=NULL &&
(hash=smb_hashstr(msg->hdr.number, t, SMB_HASH_SOURCE_FTN_ID, flags, msg->ftn_msgid))!=NULL)
hashes[h++]=hash;
flags|=SMB_HASH_STRIP_WSP;
if(!dupechk)
flags|=SMB_HASH_MARKED; /* ignore for dupe checks */
if(text!=NULL
&& (hash=smb_hashstr(msg->hdr.number, t, TEXT_BODY, flags, text))!=NULL)
if(body!=NULL &&
(hash=smb_hashstr(msg->hdr.number, t, SMB_HASH_SOURCE_BODY, flags, body))!=NULL)
hashes[h++]=hash;
return(hashes);
......@@ -256,9 +258,9 @@ int SMBCALL smb_hashmsg(smb_t* smb, smbmsg_t* msg, const uchar* text, BOOL updat
hash_t found;
hash_t** hashes; /* This is a NULL-terminated list of hashes */
hashes=smb_msghashes(msg,text,/* dupechk? */TRUE);
hashes=smb_msghashes(msg,text);
if(smb_findhash(smb, hashes, &found, update)==SMB_SUCCESS && !update) {
if(smb_findhash(smb, hashes, &found, SMB_HASH_SOURCE_ALL, update)==SMB_SUCCESS && !update) {
retval=SMB_DUPE_MSG;
safe_snprintf(smb->last_error,sizeof(smb->last_error)
,"duplicate %s: %s found in message #%lu"
......@@ -296,7 +298,7 @@ int SMBCALL smb_getmsgidx_by_hash(smb_t* smb, smbmsg_t* msg, unsigned source
hashes[1]=NULL; /* terminate list */
memset(&found,0,sizeof(found));
if((retval=smb_findhash(smb, hashes, &found, FALSE))==SMB_SUCCESS) {
if((retval=smb_findhash(smb, hashes, &found, 1<<source, FALSE))==SMB_SUCCESS) {
if(found.number==0)
retval=SMB_FAILURE; /* use better error value here? */
else {
......
......@@ -1367,7 +1367,7 @@ int SMBCALL smb_addmsghdr(smb_t* smb, smbmsg_t* msg, int storage)
msg->idx.number=msg->hdr.number=smb->status.last_msg+1;
if(!(msg->flags&MSG_FLAG_HASHED) /* not already hashed */
&& (i=smb_hashmsg(smb,msg,NULL,FALSE))!=SMB_SUCCESS) {
&& (i=smb_hashmsg(smb,msg,NULL,/* update? */FALSE))!=SMB_SUCCESS) {
smb_unlocksmbhdr(smb);
return(i); /* Duplicate message? */
}
......
......@@ -147,7 +147,7 @@ SMBEXPORT int SMBCALL smb_updatethread(smb_t* smb, smbmsg_t* remsg, ulong newms
SMBEXPORT BOOL SMBCALL smb_valid_hdr_offset(smb_t* smb, ulong offset);
/* smbadd.c */
SMBEXPORT int SMBCALL smb_addmsg(smb_t* smb, smbmsg_t* msg, int storage, BOOL dupechk
SMBEXPORT int SMBCALL smb_addmsg(smb_t* smb, smbmsg_t* msg, int storage, long dupechk_hashes
,ushort xlat, const uchar* body, const uchar* tail);
/* smballoc.c */
......@@ -166,14 +166,15 @@ SMBEXPORT int SMBCALL smb_freemsghdr(smb_t* smb, ulong offset, ulong length);
SMBEXPORT void SMBCALL smb_freemsgtxt(char* buf);
/* smbhash.c */
SMBEXPORT int SMBCALL smb_findhash(smb_t* smb, hash_t** compare_list, hash_t* found, BOOL mark);
SMBEXPORT int SMBCALL smb_findhash(smb_t* smb, hash_t** compare_list, hash_t* found
,long source_mask, BOOL mark);
SMBEXPORT int SMBCALL smb_hashmsg(smb_t* smb, smbmsg_t* msg, const uchar* text, BOOL update);
SMBEXPORT hash_t* SMBCALL smb_hash(ulong msgnum, ulong time, unsigned source
,unsigned flags, const void* data, size_t length);
SMBEXPORT hash_t* SMBCALL smb_hashstr(ulong msgnum, ulong time, unsigned source
,unsigned flags, const char* str);
SMBEXPORT hash_t** SMBCALL smb_msghashes(smbmsg_t* msg, const uchar* text, BOOL dupechk);
SMBEXPORT hash_t** SMBCALL smb_msghashes(smbmsg_t* msg, const uchar* text);
SMBEXPORT int SMBCALL smb_addhashes(smb_t* smb, hash_t** hash_list, BOOL skip_marked);
SMBEXPORT ushort SMBCALL smb_subject_crc(const char *subj);
......@@ -191,13 +192,13 @@ SMBEXPORT int SMBCALL smb_getmsghdr_by_hash(smb_t* smb, smbmsg_t* msg, unsigne
/* Fast Message-ID based look-up macros (using hashes) */
#define smb_getmsgidx_by_msgid(smb, msg, id) \
smb_getmsgidx_by_hashstr(smb, msg, RFC822MSGID, SMB_HASH_MASK, id)
smb_getmsgidx_by_hashstr(smb, msg, SMB_HASH_SOURCE_MSG_ID, SMB_HASH_MASK, id)
#define smb_getmsgidx_by_ftnid(smb, msg, id) \
smb_getmsgidx_by_hashstr(smb, msg, FIDOMSGID, SMB_HASH_MASK, id)
smb_getmsgidx_by_hashstr(smb, msg, SMB_HASH_SOURCE_FTN_ID, SMB_HASH_MASK, id)
#define smb_getmsghdr_by_msgid(smb, msg, id) \
smb_getmsghdr_by_hashstr(smb, msg, RFC822MSGID, SMB_HASH_MASK, id)
smb_getmsghdr_by_hashstr(smb, msg, SMB_HASH_SOURCE_MSG_ID, SMB_HASH_MASK, id)
#define smb_getmsghdr_by_ftnid(smb, msg, id) \
smb_getmsghdr_by_hashstr(smb, msg, FIDOMSGID, SMB_HASH_MASK, id)
smb_getmsghdr_by_hashstr(smb, msg, SMB_HASH_SOURCE_FTN_ID, SMB_HASH_MASK, id)
/* smbstr.c */
SMBEXPORT char* SMBCALL smb_hfieldtype(ushort type);
......
......@@ -137,17 +137,23 @@ char* SMBCALL smb_dfieldtype(ushort type)
char* SMBCALL smb_hashsourcetype(uchar type)
{
if(type==TEXT_BODY || type==TEXT_TAIL)
return(smb_dfieldtype(type));
return(smb_hfieldtype(type));
static char str[8];
switch(type) {
case SMB_HASH_SOURCE_BODY: return(smb_dfieldtype(TEXT_BODY));
case SMB_HASH_SOURCE_MSG_ID: return(smb_hfieldtype(RFC822MSGID));
case SMB_HASH_SOURCE_FTN_ID: return(smb_hfieldtype(FIDOMSGID));
}
sprintf(str,"%02Xh",type);
return(str);
}
char* SMBCALL smb_hashsource(smbmsg_t* msg, int source)
{
switch(source) {
case RFC822MSGID:
case SMB_HASH_SOURCE_MSG_ID:
return(msg->id);
case FIDOMSGID:
case SMB_HASH_SOURCE_FTN_ID:
return(msg->ftn_msgid);
}
return("hash");
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment