#!/usr/bin/env perl
# /Time-stamp: "2008-09-15 16:29:50 leycec"/

=head1 NAME

afallen - migrates, synchronizes, filters, and otherwise munges IMAP-accessible
          e-mail.

=head1 TODO

=over 4

=item Add "imapsync" support for OfflineIMAP-alternative IMAP-to-IMAP
      synchronization.

=back

=cut

# ....................{ INITIALIZATION                     }....................
use strict;
use warnings;
use lib "$ENV{HOME}/.perl/lib/";

# ....................{ CONSTANTS                          }....................
use constant {
  true => '1',
  false=> '',

  success=> 0,
  failure=> 1,

  timeout_after_half_a_minute=> [32],

  # Is this mail spam, ham (non-spam), or neither? If neither, then we are not
  # certain that it's verifiably one or the other. (It's quite critical to be
  # certain of things like this.)
  #
  # We assign one (and only one) of the following three values this mail, later.
  unsure=> 0,
  ham=>    1,
  spam=>   2,
}
;

# ....................{ SUBROUTINES                        }....................
sub is_installed($);
sub utter($);
sub utter_date_and($);

sub test_mail_against($);
sub pass_mail_to($);
sub pipe_mail_to($);
sub feed_mail_to($);

sub train();
sub sync();
sub sync_filter();
sub sync_filter_from_formail_to_procmail_to_afallen($);
sub test();

# ....................{ USER VARIABLES                     }....................
my $is_logging = true;

my $spam_response = <<HUBRIS;
 We regret to inform you that we regret receiving your recent message.

 We have marked this message as spam; if you feel that we are unfeeling,
 feckless pigs, and made this marking in error, and that you are--indeed--no
 fickle, sicker spammer, you may reply to this rejection with a wrathful
 intent, wrought of ego, and you rings of apprentice ire.

 Retiring, I cleave with power:

 The aisles of property--which creak about our Tonal, atonally cleft from it--
 are awfully unwelcome in this inbox.

   Thank you.

   splash On.
HUBRIS

# ....................{ MAILDIR PATHS                      }....................
# Maildir Maildir names, here, should not include the Mairdir-ish prefix ".".
# (The external "maildirmake" behaves poorly, otherwise.)

# Local Mairdir root.
my $maildir_root = "$ENV{HOME}/var/mail/gmail";

# Local Maildir to which we move mail classified as spam.
my $spam_maildir = 'dead';

# Local Maildir to which we move mail classified as ham (non-spam).
my $ham_maildir = 'life';

# Local Maildir to which we move mail classified as neither ham or spam, and
# matching no recognized white-list pattern.
my $unsure_maildir = 'read.not';

my $item_maildir_prefix = 'life.';
my $list_maildir_prefix = 'list.';

my $emergency_mail_file = "$maildir_root/.$unsure_maildir/new/emergency";

# ....................{ PROCMAIL PATHS                     }....................
# Local procmail-rc file.
my $procmailrc_file = "$ENV{HOME}/.procmailrc";

# Local procmail logfile.
my $procmail_log_file = "$ENV{HOME}/var/log/procmail";

# ....................{ INSTALLED PROGRAMS                 }....................
# Is bogofilter, a "fast Bayesian spam filter," installed?
my $is_installed_bogofilter = is_installed('bogofilter');

# Is procmail, an "autonomous mail processor," installed?
my $is_installed_procmail = is_installed('procmail');
my $is_installed_formail  = is_installed('formail');

# ....................{ TIME DEPENDENCIES                  }....................
use POSIX qw(strftime);

my @time_list = localtime;
my $date_long  = POSIX::strftime("%a %b %e %H:%M:%S %G", @time_list);
my $date_short = POSIX::strftime("%G-%m-%d %H:%M:%S",    @time_list);

# ....................{ COMMAND-LINE ARGUMENTS             }....................
if (defined $ARGV[0]) {
  my $sub_name = shift @ARGV;
  my %sub_name_to_sub = (
    sync=>        \&sync,
    sync_filter=> \&sync_filter,
    test=>        \&test,
    train=>       \&train,
  );

  if (defined $sub_name_to_sub{$sub_name}) {
              $sub_name_to_sub{$sub_name}->();
    exit success;
  }
  else { die "'$sub_name' not a recognized command!"; }
}

# ....................{ MAIL DEPENDENCIES                  }....................
use Email::Filter;
use File::Temp;

# Read one e-mail from "stdin".
my $mail = defined $ARGV[0]
  ? Email::Filter->new(emergency=> $emergency_mail_file, data=> qx{cat $ARGV[0]})
  : Email::Filter->new(emergency=> $emergency_mail_file)
  ;

# ....................{ MAIL HEADERS                       }....................
sub dechomp($) { my $string = shift; $string ? $string."\n" : '' }

my $mail_is =
   dechomp($mail->from)
  .dechomp($mail->to)
  .dechomp($mail->cc)
  .dechomp($mail->bcc)
  .dechomp($mail->header('Sender'))
  .dechomp($mail->header('List-Id'))
  ;

# Is this mail spam, ham (non-spam), or neither? We ascertain this, later.
my $mail_bogosity = unsure;

# ....................{ WHITELIST                          }....................
my $white_list = [
  # Mail contact white list.
  sub { $mail_is =~ m~
      marycurry1\@juno\.com
    | dcurry\@uoregon\.edu | daveysnothere\@yahoo\.com
    | dvaferovich
    ~ix && $item_maildir_prefix.'us.los_angeles' }
, sub { $mail_is =~ m~
      peri\@logorrhea\.com
    | lar\@wezen\.net
    ~ix && $item_maildir_prefix.'us.san_luis_obispo' }
, sub { $mail_is =~ m~
      etsuko     | shigemi
    | anchanx\@hotmail\.com
    | shiawase2002\@yahoo\.com
    | pyoung\@longwoodgardens\.org
    ~ix && $item_maildir_prefix.'jp.tokyo' }
, sub { $mail_is =~ m~
      flymighty\@yahoo\.com
    | alingelb\@yahoo\.com
    ~ix && $item_maildir_prefix.'tz.morogoro' }
, sub { $mail_is =~ m~
      xavserrato\@gmail\.com
    | rpipkin\@gmail\.com
    | heiho1\@mac\.com
    ~ix && $item_maildir_prefix.'us.new_york' }
, sub { $mail_is =~ m~
      jsn\@j-s-n\.org | jason\.dusek\@gmail\.com
    ~ix && $item_maildir_prefix.'us.san_francisco' }
, sub { $mail_is =~ m~ \.govt\.nz
    | alexis\.pietak\@gmail\.com
    | bruce\.scanlon\@gmail\.com | kaguidi\@yahoo\.com
    | adam_galahad\@yahoo\.com
    | jon\@utensil\.co\.nz
    ~ix && $item_maildir_prefix.'nz.christchurch' }

   # Mailing list white list.
,  sub {  $mail_is =~ m~kensanata~ix && $list_maildir_prefix.'oddmuse' }
,  sub { ($mail_is =~ m~
      clielasus\@gmail\.com
    | drtune\@gmail.com
    | sayajay\.net
    | the\.ajp\@gmail\.com
     ~ix || $mail->from =~ m~
       mobilegreetings\.com | mgstaff\.com
     ~ix) && $list_maildir_prefix.'schmobile' }
,  sub { $mail->from =~ m~nearlyfreespeech\.net~i && $list_maildir_prefix.'nfsn' }
,  sub { $mail->from =~ m~sdf\.lonestar\.org~i    && $list_maildir_prefix.'sdf' }
,  sub { $mail->from =~ m~
      asb\.com | bankofamerica\.com | citibank\.com | paypal\.com
    | hsbc\.com | hsbcusa.\com
    | elynchfx\.com | elldridge_Lynch
    | ebay\.com | trademe\.co\.nz
    | libraryelf\.com
    ~ix && $list_maildir_prefix . 'avarice' }
]
;

# ....................{ GUESS LIST                         }....................
# Ham (non-spam) matching no pattern in the white list, above, is tested
# against this guess list of pattern matches. This is a last-ditch, dire
# attempt to filter unrecognized ham into the most appropriate Maildir for that
# ham.
#
# Ham for job interviews and corporate sales, for example, have "Subject: "
# headers often having red-flag, signal words with which we easily match:
# "interview," "order", "conspicuously capitalistic consumptions," and so on.
# (I've named my maildir for such ham ".list.avarice"; your mileage may vary.)
my $guess_list = [
  # Mailing list guess list.
  sub { $mail->subject =~ m~
      job | interview | resume
    | developer | engineer
    | opportunity | position

    | auction | trademe
    | confirmation | reservation
    | order | invoice
    | shipment | shipped
    | tracking
    ~ix && $list_maildir_prefix . 'avarice' }
]
;

# ....................{ MAIL FILTER                        }....................
utter 'received mail from stdin...';
#utter '...on '.POSIX::strftime("%a %b %e %H:%M:%S %G", localtime);
#utter '...by "'.$mail->from.'".';
#utter '...to "'.$mail->to.'".';
#utter '...of \''.$mail->subject.'\'.';

# Prevent intermediary "pipe," "accept," or "reject" calls from exiting. (Why?
# Because we exit on the final "accept.")
$mail->exit(false);

# If we recognize this mail's sender (by finding that sender in our white-list),
# move this mail into a Maildir specific to that sender.
utter "testing against white list...";
test_mail_against $white_list;

# If "bogofilter" is installed, pipe this mail through it to mark this mail as
# spam, ham, or neither. Necessarily, this destroys and reconstructs a new
# "Email::Simple" object embodying the mail (since "bogofilter" adds a new
# header to that mail.)
if ($is_installed_bogofilter) {
  utter "passing through bogofilter...";
  $mail->simple(Email::Simple->new(feed_mail_to("bogofilter -uepI")));

      my $mail_bogosity_header = $mail->header("X-Bogosity");
     if ($mail_bogosity_header =~ m~^Yes|Spam~) { $mail_bogosity = spam; }
  elsif ($mail_bogosity_header =~ m~^No|Ham~  ) { $mail_bogosity = ham; }
  utter " passed through bogofilter... [$mail_bogosity/$mail_bogosity_header]";
}

# If this mail is spam, move it to a spam-specific Maildir; otherwise, if this
# mail is ham non-spam, move it to a ham-specific Maildir.
   if ($mail_bogosity == spam) { pass_mail_to $spam_maildir; }
elsif ($mail_bogosity == ham) {
  # If we did not recognize this mail's sender (by not finding that sender in
  # our white-list, above), but recognize this mail as ham (non-spam), try to
  # heuristically move this mail into a Maildir specific to the content of that
  # mail (by simple pattern-matching on that mail's body.) This is a poor
  # heuristic, but helps in auto-moving mail into "likely candidate" Maildirs.
  utter "testing against guess list...";
  test_mail_against $guess_list;
}

# Accept this mail into a sender-agnostic Maildir, if all else fails.
utter "passing mail to the catch-all maildir...";
pass_mail_to $unsure_maildir;

# ....................{ MAIL UTILITIES                     }....................
# Print the passed string to standard out, if logging. ("Standard out," here, is
# often the log-file for the Mail Delivery Agent (MDA) that called us--often,
# procmail, maildrop, or Cyrus Sieve.)
sub utter($) {
  my $utterance = shift || '[UNDEFINED]';
  $is_logging && warn "afallen: $utterance\n";
}

sub utter_date_and($) {
  my $utterance = shift || '[UNDEFINED]';
  $is_logging && warn "[$date_short] afallen: $utterance\n";
}

# Return "true," if the passed string is a program installed on this machine.
sub is_installed($) {
  my $program_name = shift;
  return system(qq{which $program_name 1> /dev/null 2>&1}) == success;
}

# ....................{ MAIL MATCHING                      }....................
sub test_mail_against($) {
  my $maildir_matchers = shift;
  my $maildir;
   
  foreach my     $maildir_matcher (@$maildir_matchers) {
    $maildir = &$maildir_matcher;
    $maildir && pass_mail_to $maildir;
  }
}

# ....................{ MAIL ACCEPTANCE                    }....................
# Accept this mail into the passed Maildir-style Maildir.
sub pass_mail_to($) {
  my $maildir = shift;
  my $maildir_path = "$maildir_root/.$maildir/";

  # Make the mail's Maildir, if it does not exist
  if (! -e $maildir_path) {
    system("maildirmake -f $maildir $maildir_root") == success ||
       die "Could not make maildir '$maildir_path', since: $!"
  }

  # Mark this mail as spam or ham, if not already marked as that and moving to a
  # spam- or ham-specific Maildir.
  if ($mail_bogosity == unsure && $maildir ne $unsure_maildir) {
    if ($maildir eq $spam_maildir) {
      utter "marking mail as spam...";
      $is_installed_bogofilter && feed_mail_to("bogofilter -sI");
    }
    else {
      utter "marking mail as ham...";
      $is_installed_bogofilter && feed_mail_to("bogofilter -nI");
    }
  }

  # Pass the mail into its Maildir                   
  utter "passing mail to '$maildir_path'...";
  warn "\n";

  # Exit this script on executing the following "accept" subroutine.
  $mail->exit(true);
  $mail->accept($maildir_path);
}

# ....................{ MAIL PIPING                        }....................
# Pipe this mail as stdin to an external program, and return that program's
# stdout as a string from this subroutine. Sadly, the "Email::Filter->pipe"
# subroutine does not run as advertised on my machine. (It fails silently,
# without performing the pipe.) Thusly, this subroutine serves a simple--albiet
# somewhat less robust--replacement for that subroutine.
sub pipe_mail_to($) {
  my $command = shift;

  my $temp_mail_file_handle = File::Temp->new(UNLINK=> true);
  my $temp_mail_file = $temp_mail_file_handle->filename;

  print $temp_mail_file_handle $mail->simple->as_string;

  my $pipe_command = qq{cat "$temp_mail_file" | $command};
  my $command_output = qx{$pipe_command};

  $? == success || die "Cannot pipe mail via '$pipe_command', since: $!";
  return $command_output;
}

sub feed_mail_to($) {
  my $command = shift;

  my $temp_mail_file_handle = File::Temp->new(UNLINK=> true);
  my $temp_mail_file = $temp_mail_file_handle->filename;

  print $temp_mail_file_handle $mail->simple->as_string;

  my $pipe_command = qq{$command "$temp_mail_file"};
  my $command_output = qx{$pipe_command};

  $? == success || die "Cannot pipe mail via '$pipe_command', since: $!";
  return $command_output;
}

# ....................{ MAIL SYNCHRONIZATION               }....................
# Synchronizes IMAP mail into one (or several) local IMAP Maildir repositories
# from one (or several) remote IMAP mail servers; and, for each mail
# synchronized into that local IMAP Maildir repository with new mail (the
# "INBOX"), explicitly calls procmail on that mail to filter that mail into
# the local IMAP Maildir repository to which it belongs. (Mail addressed from
# "wage.slave@the.corporation.com", for example, might be filtered into the
# the local IMAP Maildir repository named "~/Maildir/.working.class.hero/".)
sub sync() {
  is_installed('offlineimap') ||
    die "Cannot synchronize, since OfflineIMAP is not installed!";

  # Perform the synchronization.
  utter_date_and "synchronizing via OfflineIMAP...";
  system(qq{nice offlineimap}) == success ||
    die "Cannot synchronize via OfflineIMAP, since: $!";

  # Perform the filtering. ('offlineimap' does not invoke our Mail Delivery Agent
  # (MDA). As such, we do.)
  sync_filter;
}

# OfflineIMAP does not invoke our Mail Delivery Agent (MDA). As such, we do.
sub sync_filter() {
                       my $unsure_maildir_path = "$maildir_root/.$unsure_maildir";
  for my $new_mail (glob "$unsure_maildir_path/new/*") {
    sync_filter_from_formail_to_procmail_to_afallen $new_mail;

    # Delete the original new mail, now that we've copied it elsewhere. (If we've
    # copied it back into the maildir from which it came, we still delete the
    # original. The copy is guaranteed to have a unique name. In general, this is
    # not problematic.)
    unlink($new_mail) || die "Cannot delete '$new_mail', since: $!";
  }
}

sub sync_filter_from_formail_to_procmail_to_afallen($) {
  my $mail_file = shift;

  utter_date_and "synchronize-filtering '$mail_file' through procmail...";

  ($is_installed_formail and $is_installed_procmail) ||
    die "Cannot synchronize-filter, since 'formail' and/or procmail are not installed!";

  system(qq{
    nice cat "$mail_file" | formail -q- -s procmail -tm "$procmailrc_file"
  }) == success ||
    die "Cannot synchronize-filter '$mail_file' through procmail, since: $!";
}

# ....................{ MAIL TRAINING                      }....................
# Trains bogofilter on spam- and ham-designated mail already synchronized into
# local Maildirs.
sub train_bogofilter($$) {
  my $bogofilter_options = shift;
  my $training_maildir_paths = shift;

  for my $training_maildir_path (@$training_maildir_paths) {
      -d $training_maildir_path ||
      die "Cannot train; '$training_maildir_path' not a directory!";

    utter "training 'bogofilter $bogofilter_options' on '$training_maildir_path'...";
    system(qx{bogofilter -evMB $bogofilter_options "$training_maildir_path"});
  }
}

sub train() {
  if ($is_installed_bogofilter) {
    # Train bogofilter to recognize all mail in the spam Maildir as spam.
    train_bogofilter '-s', [ "$maildir_root/.$spam_maildir" ];

    # Train bogofilter to recognize all mail in all Maildirs explicitly classified
    # as neither spam or unsure, above, as ham. (This presumes the majority of the
    # Maildirs under the Maildir root have ham. If this is not the case, this
    # training algorithm will be of little use.)
    my $ham_maildirs = [
      grep { -d $_ && !/(\.|\..|\Q$spam_maildir\E|\Q$unsure_maildir\E)$/ }
      glob "$maildir_root/.*"
    ];
    train_bogofilter '-n', $ham_maildirs;
  }
  else { die "Cannot train, since bogofilter is not installed!"; }
}

# ....................{ MAIL TESTING                       }....................
sub test() {
  my $afallen_file;

  if (           -f "$ENV{PWD}/afallen") {
    $afallen_file = "$ENV{PWD}/afallen";
  }
  else {
    $afallen_file = qx{which afallen};
    chomp $afallen_file;
  }

  utter "testing '$afallen_file' syntax...";
  system(qq{perl -c "$afallen_file"}) == success ||
    die "Cannot test '$afallen_file' syntax, since: $!";

  if (-e ".procmaillock") {
    utter "forcefully removing '.procmaillock'...";
    unlink ".procmaillock" ||
      die "Cannot forcefully remove '.procmaillock', since: $!";
  }

  if ($is_logging) {
    system(qq{touch "$procmail_log_file"}) == success ||
      die "Cannot 'touch $procmail_log_file', since: $!";
    unlink $procmail_log_file ||
      die "Cannot unlink $procmail_log_file, since: $!";
  }

  # FIXME: Generalize. Blagh!
  defined($ARGV[0]) or
     push(@ARGV, glob('~/bin/afallen/test_mails/spam.mess'));

  foreach my $mail_file (@ARGV) {
    sync_filter_from_formail_to_procmail_to_afallen $mail_file;
  }

  if ($is_logging) {
    sleep 1;
    system(qq{cat "$procmail_log_file"}) == success ||
      die "Cannot 'cat $procmail_log_file', since: $!";
  }
}

  1
__END__

=head1 COPYRIGHT AND LICENSE

The information below applies to everything in this distribution,
except where noted.

Copyleft 2006-2008 by B.w.Curry.

  http://www.raiazome.com

This file is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this file; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

=cut
