Logo Search packages:      
Sourcecode: dcc version File versions  Download package

ckfuz2.c

/* Distributed Checksum Clearinghouse
 *
 * compute fuzzy body checksum #2
 *
 * Copyright (c) 2005 by Rhyolite Software
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE
 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Rhyolite Software DCC 1.2.74-1.37 $Revision$
 */

#include "ck.h"

#include "ckfuz2_tbl.h"
/* Per-language word dictionaries used by the fuzzy checksum.
 * add_word() walks this table in step with FZ2.lang[], so entry N here
 * belongs to language slot N. */
struct {
    const char **words;       /* hash table handed to lookup_word() */
    u_int   len;              /* table size handed to fuz2_word_hash() */
    const u_char *cset;       /* 0=usable with any character set, otherwise
                               * used only when cks->mime_cset is this table */
} tbls[FUZ2_LAN_NUM] = {
    {word_tbl0, word_tbl0_LEN, 0},
    {word_tbl1, word_tbl1_LEN, 0},
    {word_tbl2, word_tbl2_LEN, dcc_cset_2},
};


/* Shorthand for the fuzzy checksum #2 state; it requires a variable named
 * "cks" to be in scope wherever it is used. */
#define FZ2 cks->fuz2

/* Recognized words are batched in an LBUF before being given to MD5.
 * add_word() copies a word first and flushes only once blen has reached
 * BUF_LEN, so the buffer carries room for one more word past BUF_LEN. */
#define BUF_LEN 1024
typedef struct {
    char buf[BUF_LEN+sizeof(DCC_FUZ2_WORD)+1];
    int blen;                 /* number of bytes currently held in buf */
} LBUF;


/* Feed l bytes at b to the MD5 context of language slot lp.
 * When built with DCC_CKSUM_DEBUG and dcc_clnt_debug is 5, the same bytes
 * are also written to file descriptor 1 so the summed text can be seen.
 * The arguments are expanded more than once and without parentheses,
 * so pass only simple expressions without side effects. */
#ifdef DCC_CKSUM_DEBUG
#define FUZ2(lp, b, l) (dcc_clnt_debug == 5                       \
                  ? (write(1, b, l), MD5Update(&lp->md5, b, l))   \
                  : MD5Update(&lp->md5, b, l))
#else
#define FUZ2(lp, b, l) MD5Update(&lp->md5, b, l)
#endif


/* Prepare cks for computing a new fuzzy checksum #2.
 *
 * Resets the word, tag, character reference and URL collection state,
 * marks the FUZ2 checksum slot as being computed, and starts a fresh MD5
 * context and fresh counters for every language slot.
 * Call this before the first dcc_ck_fuz2() for a message. */
void
dcc_ck_fuz2_init(DCC_GOT_CKS *cks)
{
      FUZ2_LANG *lp;

      /* start between words and outside any HTML tag */
      FZ2.wlen = 0;
      DCC_FUZ2_WORD_CLEAR(&FZ2.w);
      FZ2.st = DCC_FUZ2_ST_WORD;

      /* dcc_ck_fuz2() tests the character reference state before
       * anything else assigns it, so it must not be left stale from
       * an earlier message or uninitialized */
      FZ2.cref_st = DCC_CREF_ST_IDLE;

      /* no URLs captured yet */
      FZ2.url_cp = FZ2.url_buf;
      FZ2.urls = 0;

      /* claim the checksum slot; dcc_ck_fuz2() and dcc_ck_fuz2_fin()
       * do nothing unless the type is DCC_CK_FUZ2 */
      cks->sums[DCC_CK_FUZ2].type = DCC_CK_FUZ2;
      cks->sums[DCC_CK_FUZ2].tgts = DCC_TGTS_INVALID;
      cks->sums[DCC_CK_FUZ2].rpt = 0;

      /* byte counters shared by all languages */
      FZ2.btotal = 0;
      FZ2.xsummed = 0;

      /* per-language word counters and MD5 contexts */
      for (lp = FZ2.lang; lp <= LAST(FZ2.lang); ++lp) {
            lp->wsummed = 0;
            lp->wtotal = 0;
            MD5Init(&lp->md5);
      }
}



/* Look for a word in one language's dictionary.
 *
 * word_tbl is indexed by fuz2_word_hash(); each non-null slot points at a
 * chain of entries, each a length byte followed by that many characters,
 * with a zero length byte ending the chain.
 *
 * Returns 1 if an entry has exactly wlen characters equal to w->b,
 * and 0 if the slot is empty or the chain ends without a match. */
static inline u_char
lookup_word(const DCC_FUZ2_WORD *w, u_int wlen,
          const char **word_tbl, u_int word_tbl_len)
{
      const char *entry;
      u_int entry_len;

      entry = word_tbl[fuz2_word_hash(w, word_tbl_len)];
      if (!entry)
            return 0;

      /* walk the chain until the zero length byte */
      while ((entry_len = *entry++) != 0) {
            if (entry_len == wlen && !memcmp(w->b, entry, entry_len))
                  return 1;
            entry += entry_len;     /* step over the characters */
      }
      return 0;
}



/* Offer the just-finished word in FZ2.w to every language.
 *
 * lbp points to an array of FUZ2_LAN_NUM buffers, one per language slot.
 * For each language whose dictionary applies to the current character set
 * and contains the word, the word is counted and appended to that
 * language's buffer.  A buffer that has reached BUF_LEN is fed to the
 * language's MD5 context and emptied. */
static void
add_word(DCC_GOT_CKS *cks, LBUF *lbp)
{
      int i;

      for (i = 0; i < FUZ2_LAN_NUM; ++i) {
            FUZ2_LANG *lang = &FZ2.lang[i];
            LBUF *out = &lbp[i];

            /* skip a dictionary tied to some other character set */
            if (tbls[i].cset != 0
                && tbls[i].cset != cks->mime_cset)
                  continue;

            if (!lookup_word(&FZ2.w, FZ2.wlen,
                         tbls[i].words, tbls[i].len))
                  continue;

            ++lang->wtotal;
            memcpy(&out->buf[out->blen], &FZ2.w, FZ2.wlen);
            out->blen += FZ2.wlen;
            if (out->blen < BUF_LEN)
                  continue;

            /* the buffer is full enough; checksum and restart it */
            lang->wsummed += out->blen;
            FUZ2(lang, out->buf, out->blen);
            out->blen = 0;
      }
}



/* Resolve the name of an HTML character reference.
 *
 * cref_tbl is indexed by fuz2_word_hash(); each non-null slot points at a
 * chain of entries, each a length byte, that many name characters, and one
 * value byte, with a zero length byte ending the chain.
 *
 * Returns the value byte of the entry whose name is exactly the clen
 * characters in w->b, or 0 if clen is zero, clen is larger than a
 * DCC_FUZ2_WORD, or no entry matches. */
static u_char
lookup_cref(DCC_FUZ2_WORD *w, u_int clen)
{
      const char *entry;
      u_int name_len;

      if (clen == 0
          || clen > sizeof(DCC_FUZ2_WORD))
            return 0;

      entry = cref_tbl[fuz2_word_hash(w, cref_tbl_LEN)];
      if (!entry)
            return 0;

      while ((name_len = *entry++) != 0) {
            if (name_len == clen && !memcmp(w->b, entry, name_len))
                  return entry[clen];     /* value follows the name */
            entry += name_len+1;          /* skip the name and its value */
      }
      return 0;
}



/* Advance the HTML character reference parser by one input character.
 *
 * The '&' has already been consumed by the caller.  c is the next
 * character, which this function examines but does not consume.
 *
 * Returns
 *    -2  the text is not a character reference; the state is back to idle
 *        and the caller should deal with c itself
 *    -1  c was absorbed into the reference and more input is needed
 *     0  the reference is complete and c is not part of it
 *     1  the reference is complete and c is its ';' terminator
 * When 0 or 1 is returned, FZ2.cref_cnt holds the resolved value: the
 * number for a numeric reference (limited to 256) or the result of
 * lookup_cref() for a named one. */
static int
adv_cref(DCC_GOT_CKS *cks, u_char c)
{
      int is_alpha, is_digit;

      is_alpha = ((c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z'));
      is_digit = (c >= '0' && c <= '9');

      switch (FZ2.cref_st) {
      case DCC_CREF_ST_START:
            /* first character after the '&' */
            if (c == '#') {
                  FZ2.cref_st = DCC_CREF_ST_NUM;
                  return -1;
            }
            if (!is_alpha)
                  goto bogus;
            DCC_FUZ2_WORD_CLEAR(&FZ2.cref_w);
            FZ2.cref_w.b[0] = c;
            FZ2.cref_cnt = 1;
            FZ2.cref_st = DCC_CREF_ST_NAME;
            return -1;

      case DCC_CREF_ST_NUM:
            /* after "&#" expect 'x' or the first decimal digit */
            if (c == 'x' || c == 'X') {
                  FZ2.cref_st = DCC_CREF_ST_HEX;
                  FZ2.cref_cnt = 0;
                  return -1;
            }
            if (!is_digit)
                  goto bogus;
            FZ2.cref_cnt = c - '0';
            FZ2.cref_st = DCC_CREF_ST_DEC;
            return -1;

      case DCC_CREF_ST_DEC:
            if (!is_digit)
                  goto end_of_number;
            FZ2.cref_cnt = FZ2.cref_cnt*10 + (c - '0');
            goto another_digit;

      case DCC_CREF_ST_HEX:
            if ((c >= 'a' && c <= 'f')
                || (c >= 'A' && c <= 'F')) {
                  /* the low 4 bits of 'a'..'f' and 'A'..'F' are 1..6 */
                  FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c & 0xf) + 9;
            } else if (is_digit) {
                  FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c - '0');
            } else {
                  goto end_of_number;
            }
            goto another_digit;

      case DCC_CREF_ST_NAME:
            if (is_alpha) {
                  /* letters beyond the size of a word are dropped */
                  if (FZ2.cref_cnt < sizeof(DCC_FUZ2_WORD))
                        FZ2.cref_w.b[FZ2.cref_cnt++] = c;
                  return -1;
            }
            if (cks->mime_cset[c] != FC_SP)
                  goto bogus;
            /* this character ends the name; replace the length
             * with the value of the reference */
            FZ2.cref_cnt = lookup_cref(&FZ2.cref_w, FZ2.cref_cnt);
            FZ2.cref_st = DCC_CREF_ST_IDLE;
            return c == ';';

      default:
            dcc_logbad(EX_SOFTWARE, "impossible fuz2 cref state");
            return -1;
      }

another_digit:
      /* keep the accumulated number from growing without bound */
      if (FZ2.cref_cnt > 256)
            FZ2.cref_cnt = 256;
      ++FZ2.btotal;
      return -1;

end_of_number:
      /* only a character of class FC_SP may end a numeric reference */
      if (cks->mime_cset[c] != FC_SP)
            goto bogus;
      FZ2.cref_st = DCC_CREF_ST_IDLE;
      return c == ';';

bogus:
      FZ2.cref_st = DCC_CREF_ST_IDLE;
      return -2;
}


/* Feed bp_len bytes of message body at bp to the fuzzy checksum #2.
 *
 * Does nothing unless dcc_ck_fuz2_init() has marked the FUZ2 slot of cks.
 * The parser state (current word, HTML tag, character reference, URL
 * buffer and counters) is kept in cks->fuz2 and read back at the start of
 * each call, so a word or tag can continue from one call to the next.
 *
 * The input is handled by a state machine:
 *    DCC_FUZ2_ST_WORD          gather letters into words and offer each
 *                              finished word to add_word()
 *    DCC_FUZ2_ST_START_TAG     collect the name after a '<'
 *    DCC_FUZ2_ST_SKIP_TAG      skip the rest of a tag while capturing URLs
 *    DCC_FUZ2_ST_SKIP_COMMENT  skip the rest of an HTML comment
 *
 * Words accepted during this call are batched in local per-language
 * buffers that are given to MD5 before returning. */
void
dcc_ck_fuz2(DCC_GOT_CKS *cks, const char *bp, u_int bp_len)
{
/* make the current word too long, so that it is discarded at the
 * next separator instead of being looked up */
#define SKIP_WORD() (FZ2.wlen = sizeof(DCC_FUZ2_WORD)+1)
/* discard the current word and go back to gathering words */
#define JUNK() (SKIP_WORD(), FZ2.st = DCC_FUZ2_ST_WORD)
      LBUF *lbp, lbufs[FUZ2_LAN_NUM];
      FUZ2_LANG *lp;
      int i;
      char *p;
      u_char c;

      if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
            return;

      /* the word buffers live only for the duration of this call */
      for (lbp = lbufs; lbp <= LAST(lbufs); ++lbp)
            lbp->blen = 0;

      while (bp_len != 0) {
            switch (FZ2.st) {
            case DCC_FUZ2_ST_WORD:
                  /* gathering a word */
                  do {
                        if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
                              /* ordinary byte; convert it to its
                               * class or letter in the character set */
                              --bp_len;
                              c = *bp++;
                              c = cks->mime_cset[c];
                              if (c == FC_CF) {
                                  /* a character reference can start
                                   * only in HTML; elsewhere the
                                   * character separates words */
                                  if (cks->mime_ct == DCC_CK_CT_HTML){
                                    FZ2.cref_st = DCC_CREF_ST_START;
                                    break;
                                  }
                                  c = FC_SP;
                              }
                        } else {
                              /* in a character reference; adv_cref()
                               * looks at the byte without consuming it */
                              i = adv_cref(cks, *bp);
                              if (i == -2) {
                                  JUNK(); /* bogus cref */
                                  continue;
                              }
                              if (i < 0) {      /* get more of cref */
                                  --bp_len;
                                  ++bp;
                                  continue;
                              }
                              bp += i;    /* use complete cref */
                              bp_len -= i;
                              /* NOTE(review): adv_cref() can leave
                               * cref_cnt at 256; confirm that the
                               * mime_cset table has an entry at that
                               * index */
                              c = cks->mime_cset[FZ2.cref_cnt];

                              if (c == FC_SK)
                                  continue;   /* ignore accent mark */
                        }

                        if (c >= FC_A) {
                              /* a letter; add it to the word or mark
                               * the word as too long */
                              ++FZ2.btotal;
                              if (FZ2.wlen < sizeof(DCC_FUZ2_WORD))
                                  FZ2.w.b[FZ2.wlen++] = c;
                              else
                                  SKIP_WORD();
                              continue;
                        }

                        if (c == FC_SP) {
                              /* end of word; look it up only if its
                               * length is acceptable */
                              if (FZ2.wlen >= MIN_WLEN
                                  && FZ2.wlen <=sizeof(DCC_FUZ2_WORD))
                                  add_word(cks, lbufs);
                              FZ2.wlen = 0;
                              DCC_FUZ2_WORD_CLEAR(&FZ2.w);
                              continue;
                        }
                        ++FZ2.btotal;

                        if (c == FC_LT) {
                              FZ2.tag_len = 0;
                              DCC_FUZ2_WORD_CLEAR(&FZ2.tag);
                              FZ2.st = DCC_FUZ2_ST_START_TAG;
                              break;
                        }

                        /* anything else spoils the current word */
                        JUNK();
                  } while (bp_len != 0);
                  break;

            case DCC_FUZ2_ST_START_TAG:
                  /* collecting an HTML tag or comment
                   * We've passed the '<' */
                  c = *bp;
#define SAVE_TAG(_c) (FZ2.tag.b[FZ2.tag_len++] = _c, \
                  ++FZ2.btotal, ++bp, --bp_len)
                  if (((c >= 'a' && c <= 'z')   /* tag */
                       || (c >= '0' && c <= '9'))
                      && FZ2.tag_len < sizeof(FZ2.tag)) {
                        SAVE_TAG(c);
                        break;
                  }
                  /* upper case letters are saved as lower case */
                  if (c >= 'A' && c <= 'Z'
                      && FZ2.tag_len < sizeof(FZ2.tag)) {
                        SAVE_TAG(c - ('A'-'a'));
                        break;
                  }
                  if ((c == '/'     /* end-tag */
                       || c == '!')   /* start of comment */
                      && FZ2.tag_len == 0) {
                        SAVE_TAG(c);
                        break;
                  }
                  if (c == '-'    /* comment */
                      && FZ2.tag_len >= 1 && FZ2.tag_len <= 2) {
                        SAVE_TAG(c);
                        break;
                  }
#undef SAVE_TAG

                  /* The byte in c ends the tag name.  It has not been
                   * consumed except where noted below. */

                  /* notice an <html> tag while in text/plain
                   * and switch to text/html */
                  if (FZ2.tag_len == 4
                      && cks->mime_ct != DCC_CK_CT_HTML
                      && !memcmp(FZ2.tag.b, "html", 4))
                        cks->mime_ct = DCC_CK_CT_HTML;

                  if (cks->mime_ct == DCC_CK_CT_HTML
                      && FZ2.tag_len > 0) {
                        /* if we are in an HTML document and we
                         * have at least one character after '<',
                         * assume it is some kind of HTML tag */
                        FZ2.xsummed += FZ2.tag_len+1; /* count '<' */
                        if (c == '>') {
                              /* optimize common simple tags */
                              ++FZ2.xsummed;
                              ++FZ2.btotal;
                              ++bp, --bp_len;
                              FZ2.st = DCC_FUZ2_ST_WORD;
                              break;
                        }
                        if (FZ2.tag_len >= 3
                            && !memcmp(FZ2.tag.b, "!--", 3)) {
                              FZ2.st = DCC_FUZ2_ST_SKIP_COMMENT;
                        } else {
                              FZ2.url.st = DCC_URL_ST_IDLE;
                              FZ2.st = DCC_FUZ2_ST_SKIP_TAG;
                        }
                  } else {
                        /* assume it is not an HTML tag and
                         * mark the whole word as junk */
                        JUNK();
                  }
                  break;

            case DCC_FUZ2_ST_SKIP_TAG:
                  /* Skip rest of boring HTML tag
                   * We ought to ignore '>' in quotes */
                  do {
                        if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
                              --bp_len;
                              c = *bp++;
                              /* NOTE(review): c is the raw byte here,
                               * while the word state compares the
                               * mime_cset class of the byte with
                               * FC_CF; confirm this is intended */
                              if (c == FC_CF) {
                                  FZ2.cref_st = DCC_CREF_ST_START;
                                  continue;
                              }
                        } else {
                              i = adv_cref(cks, *bp);
                              /* not a cref after all; the state is
                               * idle again and the byte is handled
                               * by the next pass */
                              if (i == -2)
                                  continue;
                              if (i < 0) {      /* get more of cref */
                                  --bp_len;
                                  ++bp;
                                  continue;
                              }
                              bp += i;    /* use complete cref */
                              bp_len -= i;
                              c = FZ2.cref_cnt;
                        }

                        /* capture URLs */
                        /* the result carries an action code in the
                         * bits of DCC_CK_URL_MASK and a character
                         * above DCC_CK_URL_SHIFT */
                        i = dcc_ck_url(&FZ2.url, c, &FZ2.url_cp);
                        c = i>>DCC_CK_URL_SHIFT;
                        switch ((DCC_CK_URL)(i & DCC_CK_URL_MASK)) {
                        case DCC_CK_URL_CHAR:
                              break;
                        case DCC_CK_URL_CK_LEN:
                              /* Make room if we are too close to
                               * end of buffer for maximum size URL */
                              /* Room is made by dropping the oldest
                               * '\0' terminated entry at the front.
                               * NOTE(review): the memchr() result is
                               * used unchecked; presumably a '\0'
                               * always exists when the buffer is this
                               * full -- confirm */
                              while (FZ2.url_cp
                                     >= &FZ2.url_buf[ISZ(FZ2.url_buf)
                                          - DCC_FUZ2_URL_MAX]) {
                                  p = memchr(FZ2.url_buf, '\0',
                                           FZ2.url_cp-FZ2.url_buf);
                                  ++p;
                                  memmove(FZ2.url_buf, p,
                                        FZ2.url_cp - p);
                                  FZ2.url_cp -= p - FZ2.url_buf;
                              }
                              /* separate this entry from the previous */
                              if (FZ2.url_cp != FZ2.url_buf)
                                  *FZ2.url_cp++ = '\0';
                              ++FZ2.urls;
                              break;
                        case DCC_CK_URL_HOST:
                        case DCC_CK_URL_DOT:
                              *FZ2.url_cp++ = c;
                              continue;
                        case DCC_CK_URL_HOST_END:
                        case DCC_CK_URL_HOST_RESET:
                        case DCC_CK_URL_SKIP:
                              continue;
                        }

                        if (c == '>') {
                              ++FZ2.xsummed;
                              ++FZ2.btotal;
                              FZ2.st = DCC_FUZ2_ST_WORD;
                              break;
                        }
                        if (cks->mime_cset[c] != FC_SP) {
                              ++FZ2.xsummed;
                              ++FZ2.btotal;
                              /* don't let wild tags run forever */
                              if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
                                  JUNK();
                                  break;
                              }
                        }
                  } while (bp_len != 0);
                  break;

            case DCC_FUZ2_ST_SKIP_COMMENT:
                  /* HTML comments can include HTML tags,
                   * but spammers don't understand HTML comment syntax
                   * and Netscape and IE treat (and ignore) broken
                   * comments like strange tags. */
                  do {
                        --bp_len;
                        c = *bp++;
                        if (c == '>') {
                              ++FZ2.xsummed;
                              ++FZ2.btotal;
                              FZ2.st = DCC_FUZ2_ST_WORD;
                              break;
                        }
                        if (cks->mime_cset[c] != FC_SP) {
                              ++FZ2.xsummed;
                              ++FZ2.btotal;
                              /* don't let wild tags run forever */
                              if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
                                  JUNK();
                                  break;
                              }
                        }
                  } while (bp_len != 0);
                  break;
            }
      }
      /* checksum whatever words remain buffered for each language */
      for (lbp = lbufs, lp = FZ2.lang; lbp <= LAST(lbufs); ++lbp, ++lp) {
            if (lbp->blen != 0) {
                  lp->wsummed += lbp->blen;
                  FUZ2(lp, lbp->buf, lbp->blen);
            }
      }
/* BUF_LEN was defined at file level with LBUF; it is removed here along
 * with the helpers local to this function */
#undef SKIP_WORD
#undef JUNK
#undef BUF_LEN
}



/* Finish the fuzzy checksum #2 for a message.
 *
 * Does nothing unless the FUZ2 slot of cks is still marked DCC_CK_FUZ2.
 * The language slot with the most recognized words is chosen; on a tie
 * the earlier slot wins.  The outcome is one of:
 *    - the slot is marked DCC_CK_INVALID because too little of the
 *      message was summed
 *    - an all-zero checksum that is not flagged for reporting
 *    - the MD5 digest of the chosen language, flagged for reporting
 * DCC_CKS_HAVE_SUM is set in cks->flags for the last two. */
void
dcc_ck_fuz2_fin(DCC_GOT_CKS *cks)
{
      FUZ2_LANG *best, *cand;

      if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
            return;

      /* pick the language that recognized the most words */
      best = FZ2.lang;
      for (cand = best+1; cand <= LAST(FZ2.lang); ++cand) {
            if (cand->wtotal > best->wtotal)
                  best = cand;
      }

#ifdef DCC_CKSUM_DEBUG
      if (dcc_clnt_debug > 3)
            printf("\n***fuz2: wtotal[%d]=%d summed=%d+%d btotal=%d\n",
                   (int)(best-FZ2.lang),
                   best->wtotal, best->wsummed, FZ2.xsummed, FZ2.btotal);
#endif

      /* A checksum over only a few words that cover less than 10% of
       * a big, binary file is not valid. */
      if (best->wtotal < 100
          && (best->wsummed+FZ2.xsummed)*10 < FZ2.btotal) {
            cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
            return;
      }

      /* A nearly empty message needs help from its URLs. */
      if (best->wtotal < 8) {
            if (best->wtotal + FZ2.urls*4 < 8) {
                  /* Too few words and URLs.  For local blacklisting,
                   * a message that looks empty to this checksum but
                   * is not, and is not too binary, gets a checksum
                   * of zero that is not reported to the DCC server. */
                  if ((best->wsummed+FZ2.xsummed) >= 120) {
                        memset(cks->sums[DCC_CK_FUZ2].sum, 0,
                               sizeof(cks->sums[DCC_CK_FUZ2].sum));
                        cks->flags |= DCC_CKS_HAVE_SUM;
                        return;
                  }
                  cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
                  return;
            }

            /* enough URLs to make up for the missing words */
            FUZ2(best, FZ2.url_buf, FZ2.url_cp - FZ2.url_buf);
      }

      MD5Final(cks->sums[DCC_CK_FUZ2].sum, &best->md5);
      cks->sums[DCC_CK_FUZ2].rpt = 1;
      cks->flags |= DCC_CKS_HAVE_SUM;
}

Generated by  Doxygen 1.6.0   Back to index