package Parp::Mail::Tests::Header;

=head1 NAME

Parp::Mail::Tests::Header - header tests for Parp::Mail objects

=head1 SYNOPSIS

See L<Parp::Mail>.

=head1 DESCRIPTION

This class provides methods for Parp::Mail objects which test
their message bodies for traces of spam.

=cut

use strict;
use warnings;

use Data::Dumper;
use Mail::Field;

use Parp::Blacklist qw(blacklist_lookup);
use Parp::Config qw(config);
use Parp::Friends qw(make_friend is_friend);
use Parp::IdCache;
use Parp::Options qw(opt);
use Parp::Utils qw(diagnose vprint);

=head2 matches($category, $regexp [, $debug])

Scans a category of addresses for addresses matching C<$regexp>.
The following categories are available:

=over 4

=item * ftc

Includes addresses from the envelope, C<From:>, C<X-Mailing-List:>,
C<Sender:>, and C<X-Sender:> headers.

=item * froms

Includes addresses from the envelope and C<From:> headers.

=item * all_froms

Includes addresses from the envelope, C<From:>, C<Reply-To:>,
C<Return-Path:>, C<Message-ID>, C<Sender:>, and C<X-Sender:> headers.

=back

=cut

sub matches {
  my $m = shift;
  my ($category, $re, $debug) = @_;

  my @matches = ();
  foreach my $addr (@{ $m->{addrs}{$category} }) {
    print "Testing $addr =~ /$re/\n" if $debug;
    my $matches = $addr =~ $re;
    if ($matches) {
      print "  -- matched!\n" if $debug;
      push @matches, $addr;
    }
  }

  return @matches;
}

sub is_duplicate {
  my $m = shift;

  return Parp::IdCache::is_duplicate($m->id);
}

sub add_id_to_cache {
  my $m = shift;
  return if opt('test_run') || ! opt('no_dups');
  Parp::IdCache::add_id($m->id);
  diagnose "Added id to duplicates cache.\n";
}

sub is_to_old_address {
  my $m = shift;

  my $found = 0;

  if ($m->to =~ config->old_me) {
    diagnose "*** Old address found:\n  ", $m->to, "\n";
    $found++;
  }

  if ($m->cc =~ config->old_me) {
    diagnose "*** Old address found:\n  ", $m->cc, "\n";
    $found++;
  }

  return $found;
}

sub is_passworded {
  my $m = shift;
  my $password = config->password;
  if ($m->subject =~ $password or
      ($m->header->get(config->password_header) || '') =~ $password) {
    $m->accept_mail('contains good password');
    make_friend($m->from_addr, 'gave password');
    return 1;
  }

  return 0;
}

sub is_from_good_domain {
  my $m = shift;

  if ($m->from =~ config->good_domains &&
      ($m->env_from_addr =~ config->good_domains ||
#       $m->id           =~ config->good_domains ||
       $m->sender        =~ config->good_domains))
  {
    my $good_domain = $1;
    $m->accept_mail('good domain', $good_domain);
    return 1;
  }

  return 0;
}

sub has_good_headers {
  my $m = shift;

  # Could cross-check In-Reply-To: with good domains, but
  # no spammers seem to be setting this header yet, which
  # makes it an even more powerful test.
  if ($m->in_reply_to) {
    $m->accept_mail('had In-Reply-To: header');
    return 1;
  }

  if ($m->refs =~ config->good_domains) {
    $m->accept_mail('References: had good domain', $1);
    return 1;
  }

  if ($m->subject =~ config->subject_buzzwords) {
    $m->accept_mail('subject had buzzword', $1);
    return 1;
  }

  if ($m->mailer =~ /(mutt)/i) {
    $m->accept_mail('good X-Mailer header', $1);
    return 1;
  }

  return 0;
}

sub is_from_good_person {
  my $m = shift;

  foreach my $addr (map { $m->$_() } qw/from_addr env_from_addr/) {
    next unless $addr;
    if (my $reason = is_friend($addr)) {
      $m->accept_mail('from friend', "`$addr' -- $reason");
      return 1;
    }
  }

  return 0;
}

sub has_spam_headers {
  my $m = shift;

  if (my @m = $m->matches('ftc', config->decoys)) {
    $m->{complain} = 0; # don't let them wise up to me subscribing to
                        # stuff using a dud address
    $m->reject_junk_mail('sent to a decoy address', $m[0]);
    $m->deliver_to_inbox('decoyed');
    return 1;
  }

  # Many thanks to Mark-Jason Dominus and to the authors of junkfilter
  # and the NAGS filter for some of the ideas contained herein.

  my $octet_RE = '([12]?\d\d|\d\d|\d)';
  my $ipv4_RE  = ("$octet_RE\\." x 3) . $octet_RE;
  my $foo_RE   = qr![\w.%\#\$+-/]+\*?!;
  if ($m->id !~ m/^<
                      ($foo_RE|"$foo_RE")
                      \@
                      (
                       [\w-]+ (\. [\w-]+){0,6}         |
                       \[ $ipv4_RE \]
                      )
                     >/x) {
    $m->reject_junk_mail('invalid Message-ID: header', "`" . $m->id . "'");
    return 1;
  }

  foreach my $bad_header (qw/PMFLAGS Advertisement X-Advertisement X-Shock/) {
    if ($m->header->get($bad_header)) {
      $m->reject_junk_mail('found bad header', $bad_header);
      return 1;
    }
  }

  my $uidl = $m->header->get('X-UIDL') || '';
  chomp $uidl;
  if ($uidl and $uidl !~ /^([0-9a-f]{32}|.{20})$/i) {
    $m->reject_junk_mail('invalid X-UIDL: header', "`$uidl'");
    return 1;
  }

  if ($m->status =~ /MC/i) {
    $m->reject_junk_mail('MaxAnnon! mailer');
    return 1;
  }

  if (($m->header->get('X-Distribution') || '') =~ /mass/i) {
    $m->reject_junk_mail('bulk mail sent with Pegasus');
    return 1;
  }

  if ($m->from =~ /^(<(_?\@_)?>)$/) {
    $m->reject_junk_mail("bad From: header", "`$1'");
    return 1;
  }

  if ($m->return_path =~ /^(<(_?\@_)?>)$/) {
    $m->reject_junk_mail("bad Return-Path: header", "contained `$1'");
    return 1;
  }

  if ($m->from eq '') {
    $m->reject_junk_mail('From: header is blank or missing');
    return 1;
  }

  if (((my $subj = $m->subject) =~ tr/\x80-\xff//) > 3) {
    $m->reject_junk_mail('Subject: header had too many 8-bit characters');
    return 1;
  }

  if ($m->date =~ m![^\w:,()+/ \t-]!) {
    $m->reject_junk_mail('bad Date: header', "`" . $m->date . "'");
    return 1;
  }

  if ($m->{recvds} =~ /(-0600 \(EST\)|-0[57]00 \(EDT\))/) {
    $m->reject_junk_mail('bad Received: header date', "`$1'");
    return 1;
  }

  if ($m->mailer =~ config->bad_words) {
    $m->reject_junk_mail("bad X-Mailer: header", "contained `$1'");
    return 1;
  }

  if ($m->mailer =~ /[0-9a-f]{10}/i) {
    $m->reject_junk_mail("bad X-Mailer: header", "contained nonsense hex");
    return 1;
  }

  if ($m->{recvds} =~ config->bad_words) {
    $m->reject_junk_mail("bad Received: header", "contained `$1'");
    return 1;
  }

  if ($m->organisation =~ config->bad_words) {
    $m->reject_junk_mail("bad organisation header", "contained `$1'");
    return 1;
  }

  if ($m->precedence =~ /(junk)/i) {
    # Precedence: bulk sometimes set by list software :-(
    $m->reject_junk_mail("bad precedence header", "contained `$1'");
    return 1;
  }

  if (@{ $m->ftc_addrs } > config->max_recipients) {
    $m->reject_junk_mail('too_many_recipients');
    return 1;
  }

  if ($m->subject =~ /\s{8}[0-9]+\s*$/) { # 8 spaces seems to be about right
    $m->reject_junk_mail('Subject: header contained trailing spam id');
    return 1;
  }

  if (config->like_me) {
    my @recipients_like_me = $m->matches('ftc', config->like_me);
    if (@recipients_like_me > config->max_like_me) {
      $m->reject_junk_mail('spam sent to a big alphabetical list');
      return 1;
    }
  }

  if ($m->from_addr =~ /(\@{2,})/) {
    $m->reject_junk_mail('bad From: address', "contained `$1'");
    return 1;
  }

  return 1 if $m->has_spam_received_headers;
  
  return 0;
}

sub has_spam_received_headers {
  my $m = shift;
  $m->parse_received_headers;
  while (my ($recvd, $tree) = each %{ $m->{recvd_parse_trees} }) {
    return 1 if $m->received_header_is_spam($recvd, $tree);
  }
}

sub parse_received_headers {
  my $m = shift;

  return if $m->{recvd_parses_done};

  my $failed_parses_output = '';

  foreach my $recv (@{$m->{recvds_array}}) {
    $recv =~ s/\s*\n\s*/ /gm;

    my $obj = Mail::Field->new('Received', $recv);
    $obj->debug(5);

    if (! $obj->parsed_ok) {
      # Output follows in order ...

      # First, preamble before parser errors
      $failed_parses_output .= <<EOF;
--
Error parsing Received: `$recv'

EOF

      # Then, a reminder of the message details
      $failed_parses_output .= $m->full_from . $m->full_to;
      $failed_parses_output .= $m->full_cc if $m->cc;
      $failed_parses_output .= $m->full_subject . $m->full_id;

      # Finally, the incomplete parse tree
      $failed_parses_output .= Dumper($obj->parse_tree) . "\n";

      $m->{recvd_parses_failed}++;

      $failed_parses_output .= $obj->diagnostics;
    }

    $m->{recvd_parse_trees}{$recv} = $obj->parse_tree;
#    vprint Dumper($obj->parse_tree), "\n";
  }

  $m->{recvd_parses_out} = $failed_parses_output;
  $m->{recvd_parses_done} = 1;
}

sub received_header_is_spam {
  my ($m, $recvd, $tree) = @_;

  my $HELO        = $tree->{from}{HELO}   || '';
  my $from_domain = $tree->{from}{domain} || '';

  if ($HELO =~ /tmpstr/i) {
    $m->reject_junk_mail('bad HELO', $HELO);
    return 1;
  }

# Some bona-fide MTAs give a single sub-domain as the domain.  Bah.
#   if ($HELO                  &&
#       $HELO !~ /\./          &&
#       $HELO ne 'localhost'   &&
#       $from_domain !~ /^$HELO/)
#   {
#     $m->reject_junk_mail('Received header had invalid from domain',
#                          $HELO);
#     return 1;
#   }

  if ($HELO =~ /hotmail\.com/ && $from_domain !~ /hotmail\.com$/) {
    my $whole = $tree->{from}{whole};
    chomp $whole;
    $m->reject_junk_mail('Received header faked as hotmail', $whole);
    return 1;
  }

  return 0;
}

sub has_suspicious_headers {
  my $m = shift;

  if ($m->to eq '' and $m->cc eq '') {
    $m->reject_junk_mail('To: and Cc: headers both blank or missing');
    return 1;
  }

  if ($m->to eq '') {
    $m->reject_junk_mail('To: header blank or missing');
    return 1;
  }

  if ($m->to =~ config->bad_to) {
    $m->reject_junk_mail("bad To: header", "contained `$1'");
    return 1;
  }

  if ($m->subject =~ config->bad_subjects) {
    $m->reject_junk_mail("bad Subject: header", "contained `$1'");
    return 1;
  }

  if (((my $subject = $m->subject) =~ tr/!/!/) >= 5 ||
      $m->subject =~ /!!!!/) {
    $m->reject_junk_mail("Subject: header contained too many exclamation marks");
    return 1;
  }
  
  if ((my @words = $m->subject =~ /\b[A-Z]+\b/g) >= 6) {
    $m->reject_junk_mail('Subject: header had too many all-caps words');
    return 1;
  }

  my @words = split /\s+/, $m->subject;
  my @nonsense_words =
    grep { y/0-9// > 5 and y/A-Z// >= 2 and y/a-z// >= 2 } @words;
  if (@nonsense_words) {
    $m->reject_junk_mail('Subject: header had nonsense word',
                         $nonsense_words[0]);
    return 1;
  }
  
# This one is a bit extreme ...
#  if ($m->subject eq '') {
#    $m->reject_junk_mail('Subject: header is blank or missing');
#    return 1;
#  }

# This one is a bit extreme too ...
#  if ($m->from =~ /^(\d+)\@/ ||
#      $m->from =~ /^(\d+)\@/)
#  {
#    $m->reject_junk_mail('username is all digits', "`$1'");
#    return 1;
#  }

  return 0;
}

sub has_spam_from_addresses {
  my $m = shift;

  if (my @m = $m->matches('all_froms', config->bad_from)) {
    $m->reject_junk_mail('bad from address', "contained `$m[0]'");
    return 1;
  }

  return 0;
}

sub has_spam_domains_anywhere {
  my $m = shift;

  if (my @m = $m->matches('all_froms', config->bad_origins)) {
    $m->reject_junk_mail('bad from/return address', "`$m[0]'");
    return 1;
  }

  if ($m->{recvds} =~ config->bad_origins) {
    $m->reject_junk_mail('bad Received: address', "`$1'");
    return 1;
  }

  $m->parse_received_headers;

  return 1 if opt('do_blacklist') && $m->has_blacklisted_IPs;

  return 0;
}

sub has_blacklisted_IPs {
  my $m = shift;
  
  my $debug = 0;
  my %ips = ();

  foreach my $recv (@{$m->{recvds_array}}) {
    # Avoid various false positives
    $recv =~ s/JetMail \d\.\d\.\d\.\d\b//g;
      
    my @ips = $recv =~ m@(?<!/)\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b(?!\.)@g;
    @ips = grep { ! exists $ips{$_} } @ips;
    vprint "blacklist check on $recv ...\n" if $debug && @ips;
    foreach my $ip (@ips) {
      $ips{$ip}++;
      vprint "  Checking IP $ip ... " if $debug;
      my $rbl = blacklist_lookup($ip);
      if ($rbl) {
        my $reason = $rbl eq '1' ? $ip : $rbl;
#       if ($reason =~ /(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/) {
#         $reason = "http://ordb.org/lookup/?host=$1";
#       }
        vprint ''. ($rbl || 'blacklisted') . "\n" if $debug;
        my @reject = ('Blacklisted');
        push @reject, $reason if $reason;
        $m->reject_junk_mail(@reject);
        return 1;
      } else {
        vprint "not found\n" if $debug;
      }
    }
  }

  return 0;
}

sub for_me {
  my $m = shift;
  if ($m->to =~ config->me or $m->cc =~ config->me) {
    return 1;
  }
  $m->reject_junk_mail('not addressed to me');
  return 0;
}

=head1 SEE ALSO

L<Parp::Mail>

=cut

1;
