# Duplicates based on message-ID
# Blank message-IDs are common on spam, so the dupe check
# redirects a lot of those. I'd like to separate that special
# case out.
:0
* ! ^X-Loop: *no-messid-check@qz.little-neck.ny.us
{
  :0:
  * ^Message-ID:[ ]*?[ ]*$
  $NOTES/spam-output

  LOCKFILE=msgid.lock

  :0:
  * ? formail -D 32768 msgid.cache
  $NOTES/duped-mail

  LOCKFILE
}

# Calculate some sizes for use in recipes
# "B"ody only. Headers vary too much. Using ":0B" doesn't work, hence
# this weird syntax. It is so easy to hate procmail for stuff like this.
# Adding "* 1^0 ^^" would fix an off-by-one size problem, but that
# doesn't bother me.
:0
* B ?? 1^1 ^.*$
{ }
Lines = $=

:0
* B ?? 1^1 [^ ]+([ ]+|$)
{ }
Words = $=

:0
* B ?? 1^1 > 1
{ }
Chars = $=

# Other stuff used by these recipes
# Return-Path:, dropping the leading space
:0
* ^Return-Path: \/.*
{ Path=$MATCH }

# From: cannot be counted on to have a leading space
:0
* ^From:\/.*
{ From=$MATCH }

# Subject: cannot be counted on to have a leading space
:0
* ^Subject:\/.*
{ Subject=$MATCH }

# Ignore ping / policy requests plus Internet Oracle replies for
# these tests. Further, ignore stuff I send myself.
:0
* ! ^(To:.*(request.*|policy)@qz|Subject: The Oracle |From .*!(eli|jamin) )
{
  # Duplicates, as determined by a metric of From: and Subject:
  # contents plus message size (in words). This one is designed
  # to catch spammers that send the same thing every few days.
  # Use a word count to allow some flexibility in size, but keep
  # a large portion of the From: and Subject:.
  LOCKFILE=mymtrcmail.lock

  # Order is selected to minimize long line truncation problems.
  Metric="$Words$From$Subject"

  # -s N  size of dbfile (# of hashes, not byte size as with formail)
  # -l N  length of line considered significant (not a good idea to
  #       change after creating the db file)
  # No -t speeds up hashd when matches are found, at the cost of
  # increased chances of oft-repeating stuff falling off the bottom.
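An aside on the cache behavior the comments above describe: `hashd` appears to be a local tool, so the sketch below is an assumption about its idea, not its actual source. It keeps a bounded list of previously seen metric lines (`-s`), compares only a significant prefix of each line (`-l`), and with `-t` refreshes matched entries so frequent repeats do not fall off the bottom. The class and method names here are hypothetical.

```python
# Hypothetical sketch of a hashd-style duplicate cache.
# Real hashd may hash its lines and store them differently.
from collections import OrderedDict

class DupCache:
    """Fixed-size cache of previously seen metric lines.

    size      -- maximum number of entries (like hashd's -s N)
    sig_len   -- significant prefix length (like hashd's -l N)
    cycle_top -- refresh matched entries   (like hashd's -t)
    """

    def __init__(self, size, sig_len, cycle_top=False):
        self.size = size
        self.sig_len = sig_len
        self.cycle_top = cycle_top
        self.seen = OrderedDict()

    def check(self, line):
        """Return True (duplicate) if line was seen before; else record it."""
        key = line[:self.sig_len]
        if key in self.seen:
            if self.cycle_top:
                self.seen.move_to_end(key)  # keep hot entries alive
            return True
        self.seen[key] = True
        if len(self.seen) > self.size:
            self.seen.popitem(last=False)   # oldest entry falls off
        return False
```

Without `cycle_top`, a match costs nothing extra (hence the "No -t will speed up hashd" note above), but an oft-repeated metric is eventually pushed out by newer entries.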
  :0
  * ? hashd mymtrcmail.cache -s 900 -l 200 "$Words$From$Subject"
  {
    :0fhw
    | formail -I"Words-From-Subject: $Metric"

    :0:
    $NOTES/duped-mail
  }
  LOCKFILE

  # Duplicates, as determined by a metric of Subject: contents
  # plus message size in bytes. This one is designed to
  # catch real brain-dead send-the-same-thing-over-and-over
  # type spams. I'm less concerned about truncation, and
  # more concerned with file size.
  LOCKFILE=subsize.lock

  # Order is selected to minimize long line truncation problems.
  Metric="$Chars$Subject"

  # -s N  size of dbfile (# of hashes, not byte size as with formail)
  # -l N  length of line considered significant (not a good idea to
  #       change after creating the db file)
  # -t    cycle matches to the top
  :0
  * ? hashd subsize.cache -t -s 10 -l 100 "$Chars$Subject"
  {
    :0fhw
    | formail -I"Chars-Subject: $Metric"

    :0:
    $NOTES/duped-mail
  }
  LOCKFILE
}

# Only run on news stuff
:0
* ^To: *alt-sex-stories-moderated@
{
  LOCKFILE=pathsize.lock

  # The formail recipe below logs $Metric, so set it to match.
  Metric="$Words$Path"

  # -s N  size of dbfile (# of hashes, not byte size as with formail)
  # -l N  length of line considered significant (not a good idea to
  #       change after creating the db file)
  # -t    cycle matches to the top
  :0
  * ? hashd pathsize.cache -t -s 50 -l 50 "$Words$Path"
  {
    :0fhw
    | formail -I"Words-Path: $Metric"

    :0:
    $NOTES/duped-mail
  }
  LOCKFILE

  # Order is selected to minimize long line truncation problems.
  Metric="$Lines$From$Subject"

  # Duplicates, as determined by a metric of From: and Subject:
  # contents plus message size (in lines). This one is designed
  # to catch spammers that send the same thing every few days.
  # Use a line count to allow some flexibility in size, but keep
  # a large portion of the From: and Subject:.
  LOCKFILE=mymtrc.lock

  # -s N  size of dbfile (# of hashes, not byte size as with formail)
  # -l N  length of line considered significant (not a good idea to
  #       change after creating the db file)
  # No -t speeds up hashd when matches are found, at the cost of
  # increased chances of oft-repeating stuff falling off the bottom.
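For reference, the `Lines`, `Words`, and `Chars` variables scored near the top of this file amount to counting body lines, whitespace-separated tokens, and total body size. The sketch below is a plain-Python approximation of those three `1^1` scoring recipes; procmail's scoring (including the off-by-one noted above) can differ slightly at the edges.

```python
# Approximation of the three procmail 1^1 scoring recipes:
#   ^.*$            -> one point per body line      (Lines)
#   [^ ]+([ ]+|$)   -> one point per token          (Words)
#   > 1             -> body size                    (Chars)
def body_metrics(body):
    """Return (lines, words, chars) for a message body string."""
    lines = len(body.splitlines())   # one score per matched line
    words = len(body.split())        # whitespace-separated tokens
    chars = len(body)                # total size of the body
    return lines, words, chars
```

The metric strings are then just concatenations, e.g. `f"{words}{from_}{subject}"` in place of `Metric="$Words$From$Subject"` (the variable names here are illustrative).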
  :0
  * ? hashd mymtrc.cache -s 1200 -l 200 "$Lines$From$Subject"
  {
    :0fhw
    | formail -I"Lines-From-Subject: $Metric"

    :0:
    $NOTES/duped-mail
  }
  LOCKFILE
}
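The `formail -D 32768 msgid.cache` condition at the top of this file succeeds when the current Message-ID is already in a cache file capped at 32768 bytes, and records it otherwise. The sketch below illustrates that idea only; it is not formail's actual implementation, and the function name and line-per-ID cache layout are assumptions.

```python
# Hypothetical illustration of formail -D semantics: a byte-capped
# Message-ID cache where old IDs fall out as new ones are appended.
def seen_message_id(msgid, cache_path, maxlen=32768):
    """Return True if msgid is already cached; otherwise append it."""
    try:
        with open(cache_path, "r+", encoding="utf-8") as f:
            data = f.read()
            if msgid in data.splitlines():
                return True
            data += msgid + "\n"
            if len(data) > maxlen:
                data = data[-maxlen:]              # trim oldest bytes
                data = data[data.find("\n") + 1:]  # keep whole lines
            f.seek(0)
            f.truncate()
            f.write(data)
    except FileNotFoundError:
        with open(cache_path, "w", encoding="utf-8") as f:
            f.write(msgid + "\n")
    return False
```

As in the recipes above, the caller must hold a lockfile around this check, since a condition that fails still modifies the cache.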