From 9733ef69a983920fe822c9e57b415c5b6057da87 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 25 Feb 2012 15:46:04 +0100 Subject: Initial import --- .gitignore | 2 + Makefile | 8 + maildirlearn.c | 534 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ spamclassify.py | 128 ++++++++++++++ 4 files changed, 672 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 maildirlearn.c create mode 100755 spamclassify.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f5613ab --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +maildirlearn +*.sw? diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..53c7571 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +CFLAGS=-Wall -O3 -march=native + +maildirlearn: maildirlearn.c + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< + +clean: + $(RM) -f maildirlearn + diff --git a/maildirlearn.c b/maildirlearn.c new file mode 100644 index 0000000..91b66c8 --- /dev/null +++ b/maildirlearn.c @@ -0,0 +1,534 @@ +/* +Copyright (c) 2012 + Lars-Dominik Braun + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* strdup */ +#define _BSD_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* spam status (tri-state + unknown) */ +typedef enum {UNKNOWN, SPAM, HAM, UNSURE} status_t; + +/* linked list */ +struct wdpath { + struct wdpath *next; + /* inotify watch fd */ + int wd; + /* relative path */ + char *path; +}; +typedef struct wdpath wdpath_t; + +/* recursive dir reading, linked list */ +struct notifyDirread { + struct notifyDirread *next; + wdpath_t *wdp; + DIR *dir; +}; +typedef struct notifyDirread notifyDirread_t; + +typedef struct { + /* inotify fd */ + int fd; + + /* wd hash table */ + wdpath_t **tbl; + /* hash table size */ + size_t len; + + /* “root” dir we’re watching */ + char *basedir; + + char buf[sizeof (struct inotify_event)+1024]; + /* can’t use the same buf for real and simulated events */ + char direvbuf[sizeof (struct inotify_event)+1024]; + size_t filled, read; + + /* linked list and last element of linked list (O(1) append) */ + notifyDirread_t *dir, *lastdir; +} notify_t; + +/* watch descriptor to hash + */ +static unsigned int notifyHash (const notify_t *n, const int wd) { + assert (n != NULL); + + return wd % n->len; +} + +/* delete watch descriptor from hashtable + */ +static bool notifyTblDel (notify_t *n, const int wd) { + unsigned int h; + wdpath_t *cur, *prev; + + assert (n != NULL); + + h = notifyHash (n, wd); + cur = n->tbl[h]; + prev = cur; + + while (cur != NULL) { + if (cur->wd == wd) { + if (prev == cur) { + /* remove first entry in list */ + n->tbl[h] = cur->next; + } else { + prev->next = cur->next; + } + printf ("[-] %i, %s\n", wd, cur->path); + free (cur->path); + free (cur); + return true; + } + prev = cur; + cur = cur->next; + } + + return false; +} + +/* add watch descriptor and path to hash table + */ +static wdpath_t *notifyTblAdd (notify_t *n, const int wd, const char *relpath) { + unsigned int h; + wdpath_t *cur; + + assert (n != NULL); + assert (relpath != NULL); + + h = notifyHash (n, wd); + cur = n->tbl[h]; + + if (cur != NULL) { + while (cur->next != NULL) { + if (cur->wd == wd) { + /* already have this one */ + return cur; + } + cur = cur->next; + } + cur->next = malloc (sizeof (*cur->next)); + assert (cur->next != NULL); + cur = cur->next; + } else { + cur = malloc (sizeof (*cur)); + assert (cur != NULL); + n->tbl[h] = cur; + } + + cur->wd = wd; + cur->path = strdup (relpath); + assert (cur->path != NULL); + cur->next = NULL; + + printf ("[+] %i, %s\n", wd, relpath); + + return cur; +} + +/* create watch for relpath (relative to basedir) + */ +static bool notifyAdd (notify_t *n, const char *relpath) { + char path[1024]; + int wd; + wdpath_t *wdp; + + assert (n != NULL); + assert (n->basedir != NULL); + assert (relpath != NULL); + + if (snprintf (path, sizeof (path), "%s%s", n->basedir, relpath) >= sizeof (path)) { + /* truncated */ + return false; + } + + if ((wd = inotify_add_watch (n->fd, path, IN_CREATE | IN_MOVED_TO)) == -1) { + perror ("inotify_add_watch"); + return false; + } + + wdp = notifyTblAdd (n, wd, relpath); + assert (wdp != NULL); + + /* set up recursion, append to list */ + notifyDirread_t *cur; + if (n->lastdir == NULL) { + n->dir = malloc (sizeof (*n->dir)); + cur = n->dir; + } else { + n->lastdir->next = malloc (sizeof (*n->lastdir->next)); + cur = n->lastdir->next; + } + cur->wdp = wdp; + cur->dir = NULL; + cur->next = NULL; + n->lastdir = cur; + + return true; +} + +/* retrieve path from hash table + */ +static const wdpath_t *notifyTblGet (notify_t *n, const int wd) { + unsigned int h; + wdpath_t *cur; + + assert (n != NULL); + + h = notifyHash (n, wd); + cur = n->tbl[h]; + + while (cur != NULL) { + if (cur->wd == wd) { + return cur; + } + cur = cur->next; + } + + return NULL; +} + +/* go to next dir in recursive dir list + */ +static void notifyDirNext (notify_t *n) { + notifyDirread_t *next; + + closedir (n->dir->dir); + + next = n->dir->next; + free (n->dir); + n->dir = next; + if (n->dir == NULL) { + /* this was the last dir in the list */ + n->lastdir = NULL; + } +} + +/* read next event and retrieve event/watch descriptor path structs + */ +static bool notifyRead (notify_t *n, const struct inotify_event **retEvent, + const wdpath_t **retWdp) { + struct inotify_event *event; + const wdpath_t *wdp; + + assert (n != NULL); + assert (retEvent != NULL); + assert (retWdp != NULL); + + while (n->dir != NULL) { + if (n->dir->dir != NULL) { + /* continue reading open dir */ + struct dirent *dent; + + while (true) { + dent = readdir (n->dir->dir); + if (dent != NULL) { + struct stat sb; + char fullpath[1024]; + + if (strcmp (dent->d_name, "..") == 0 || strcmp (dent->d_name, ".") == 0) { + continue; + } + + assert (n->dir->wdp != NULL); + assert (n->dir->wdp->path != NULL); + if (snprintf (fullpath, sizeof (fullpath), "%s%s%s", n->basedir, + n->dir->wdp->path, dent->d_name) >= sizeof (fullpath)) { + /* overflow */ + assert (0); + continue; + } + + if (stat (fullpath, &sb) == -1) { + perror ("stat"); + continue; + } + if (!S_ISDIR (sb.st_mode)) { + continue; + } + + /* simulate create dir event */ + event = (struct inotify_event *) n->direvbuf; + event->wd = n->dir->wdp->wd; + event->mask = IN_CREATE | IN_ISDIR; + strncpy (event->name, dent->d_name, sizeof (n->direvbuf)-sizeof (*event)-1); + event->len = strlen (dent->d_name)+1; + + *retEvent = event; + *retWdp = n->dir->wdp; + return true; + } else { + notifyDirNext (n); + break; + } + /* never reached */ + assert (0); + } + } else { + char fullpath[1024]; + + assert (n->dir != NULL && n->dir->wdp != NULL && n->dir->wdp->path != NULL); + + snprintf (fullpath, sizeof (fullpath), "%s%s", n->basedir, + n->dir->wdp->path); + n->dir->dir = opendir (fullpath); + if (n->dir->dir == NULL) { + notifyDirNext (n); + } + } + } + + if (n->read >= n->filled) { + ssize_t ret; + + if ((ret = read (n->fd, n->buf, sizeof (n->buf))) == -1) { + perror ("read"); + return false; + } + + n->read = 0; + n->filled = ret; + } + + event = (struct inotify_event *) (n->buf + n->read); + n->read += sizeof (*event)+event->len; + + wdp = notifyTblGet (n, event->wd); + if (wdp == NULL) { + printf ("no wdp\n"); + return false; + } + + *retEvent = event; + *retWdp = wdp; + + return true; +} + +/* initialize notify struct: obtain inotify fd, add watch for basedir + */ +static bool notifyInit (notify_t *n, const size_t len, const char *basedir) { + assert (n != NULL); + assert (basedir != NULL); + + memset (n, 0, sizeof (*n)); + + n->len = len; + n->tbl = calloc (n->len, sizeof (*n->tbl)); + n->basedir = strdup (basedir); + + if ((n->fd = inotify_init ()) == -1) { + perror ("inotify_init"); + return false; + } + + return notifyAdd (n, ""); +} + +/* get current bogofilter status, get desired status from path (via regex), + * set new bogofilter status + */ +static bool runBogofilter (const char *bogopath, const regex_t spamdirre, + const char *path) { + status_t curStatus = UNKNOWN, newStatus = UNKNOWN; + pid_t pid; + + /* get current status */ + pid = fork (); + if (pid == -1) { + perror ("fork"); + return false; + } else if (pid == 0) { + /* child */ + if (execl (bogopath, bogopath, "-I", path, (char *) NULL) == -1) { + perror ("execl"); + return false; + } + /* never reached */ + assert (0); + } else { + int status; + + if (waitpid (pid, &status, 0) == -1) { + perror ("waitpid"); + return false; + } else { + /* translate bogofilter exit status to internal status */ + switch (WEXITSTATUS (status)) { + case 2: + printf ("curStatus=unsure\n"); + curStatus = UNSURE; + break; + + case 1: + printf ("curStatus=ham\n"); + curStatus = HAM; + break; + + case 0: + printf ("curStatus=spam\n"); + curStatus = SPAM; + break; + + default: + /* invalid status */ + return false; + break; + } + } + } /* end if fork() */ + + /* user decided this is spam? */ + if (regexec (&spamdirre, path, 0, NULL, 0) == 0) { + /* match */ + printf ("new status: spam\n"); + newStatus = SPAM; + } else { + printf ("new status: ham\n"); + newStatus = HAM; + } /* end if regex */ + + /* set new status */ + if (curStatus != newStatus) { + const char *bogoopts = NULL; + + if (curStatus == UNSURE) { + if (newStatus == HAM) { + bogoopts = "-n"; + } else if (newStatus == SPAM) { + bogoopts = "-s"; + } + } else if (curStatus == SPAM && newStatus == HAM) { + bogoopts = "-Sn"; + } else if (curStatus == HAM && newStatus == SPAM) { + bogoopts = "-Ns"; + } else { + assert (0); + } + + pid = fork (); + if (pid == -1) { + perror ("fork"); + } else if (pid == 0) { + /* child */ + if (execl (bogopath, bogopath, bogoopts, "-I", path, + (char *) NULL) == -1) { + perror ("execl"); + return false; + } + /* never reached */ + assert (0); + } else { + int status; + + if (waitpid (pid, &status, 0) == -1) { + perror ("waitpid2"); + return false; + } else { + printf ("bogofilter returned %i\n", WEXITSTATUS (status)); + } + } + } /* end if curStatus != newStatus */ + + return true; +} + +int main (int argc, char **argv) { + regex_t spamdirre, excludere; + const char bogopath[] = "bogofilter"; + /* with '/' postfix */ + const char watchdir[] = "mail/"; + int running = 1; + notify_t n; + + /* setup */ + notifyInit (&n, 128, watchdir); + + if (regcomp (&spamdirre, "mail/\\.Junk/", REG_EXTENDED) != 0) { + printf ("invalid spamdir re\n"); + } + + if (regcomp (&excludere, "mail/(\\.Unsure/|[^/]+/(tmp|.*:2,[A-S]*T[U-Z]*$)|.*dovecot)", REG_EXTENDED) != 0) { + printf ("invalid exclude re\n"); + } + + while (running) { + const struct inotify_event *event; + const wdpath_t *wdp; + + if (notifyRead (&n, &event, &wdp)) { + char fullpath[1024], *relpath; + + assert (event != NULL); + assert (wdp != NULL); + + /* a word of warning: don’t use event->name if event->len is 0! */ + if (snprintf (fullpath, sizeof (fullpath), "%s%s%s", n.basedir, + wdp->path, (event->len == 0) ? "" : event->name) >= sizeof (fullpath)) { + /* overflow */ + assert (0); + continue; + } + relpath = fullpath + strlen (n.basedir); + + /* is path excluded? */ + if (regexec (&excludere, fullpath, 0, NULL, 0) == 0) { + continue; + } + + //printf ("[!] full: %s\n", fullpath); + + if (event->mask & IN_IGNORED || event->mask & IN_DELETE_SELF) { + /* watch was removed */ + notifyTblDel (&n, event->wd); + } else if (event->mask & IN_ISDIR) { + /* FIXME: insane strncat */ + strncat (fullpath, "/", sizeof (fullpath)-strlen (fullpath)-1); + notifyAdd (&n, relpath); + } else { + runBogofilter (bogopath, spamdirre, fullpath); + } + } else { + printf ("notifyRead failed\n"); + } + } + + regfree (&spamdirre); + regfree (&excludere); +} diff --git a/spamclassify.py b/spamclassify.py new file mode 100755 index 0000000..7c40c27 --- /dev/null +++ b/spamclassify.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python + +# Copyright (c) 2012 +# Lars-Dominik Braun +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import os, re, sys, subprocess, pyinotify + +class Classifier (object): + # files in junk are spam + SPAMDIR = re.compile (r'mail/\.Junk/(new|cur)') + # exclude: Unsure messages, temporary directory, trashed messages (see + # http://cr.yp.to/proto/maildir.html), dovecot files + EXCLUDE = re.compile (r'mail/(\.Unsure/(new|cur)|[^/]+/(tmp|.*:2,[A-S]*T[U-Z]*$)|.*dovecot)') + + SPAM = 1 + HAM = 2 + UNSURE = 3 + + def statusFromFile (self, path): + """ + filter mail through bogofilter and return spam status + """ + + ret = subprocess.call (['bogofilter', '-I', path]) + if ret == 2: + return self.UNSURE + elif ret == 1: + return self.HAM + elif ret == 0: + return self.SPAM + elif ret == 3: + raise Exception ('bogofilter returned error') + + def statusFromPath (self, path): + """ + get spam status from file path + """ + + if self.SPAMDIR.search (path): + return self.SPAM + else: + return self.HAM + + def isExcluded (self, path): + """ + is path excluded? + """ + + return self.EXCLUDE.search (path) + + def setStatus (self, curStatus, newStatus, path): + """ + tell bogofilter to set new spam status for message + """ + if curStatus != newStatus: + bogoopts = None + if curStatus == self.UNSURE: + if newStatus == self.HAM: + bogoopts = '-n' + elif newStatus == self.SPAM: + bogoopts = '-s' + elif curStatus == self.SPAM and newStatus == self.HAM: + bogoopts = '-Sn' + elif curStatus == self.HAM and newStatus == self.SPAM: + bogoopts = '-Ns' + print 'bogoopts %s' % bogoopts + ret = subprocess.call (['bogofilter', bogoopts, '-I', path]) + print 'bogofilter returned %i' % ret + return True + else: + return False + + + def updateStatus (self, path): + """ + does all the magic + """ + if self.isExcluded (path): + return False + curStatus = self.statusFromFile (path) + newStatus = self.statusFromPath (path) + return self.setStatus (curStatus, newStatus, path) + +class EventHandler(pyinotify.ProcessEvent): + def __init__ (self, wm, mask, classifier): + pyinotify.ProcessEvent.__init__ (self) + + self.wm = wm + self.mask = mask + self.classifier = classifier + + def process_IN_MOVED_TO (self, event): + self.classifier.updateStatus (event.pathname) + + def process_IN_CREATE (self, event): + # make sure new directories are watched as well + if event.mask & pyinotify.IN_ISDIR: + wdd = self.wm.add_watch(event.pathname, self.mask, rec=True) + else: + self.classifier.updateStatus (event.pathname) + +if __name__ == '__main__': + c = Classifier () + wm = pyinotify.WatchManager() + mask = pyinotify.IN_MOVED_TO | pyinotify.IN_CREATE + handler = EventHandler(wm, mask, c) + notifier = pyinotify.Notifier(wm, handler) + wdd = wm.add_watch(os.path.expanduser ('mail/'), mask, rec=True) + notifier.loop() + -- cgit v1.2.3