aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2012-02-25 15:46:04 +0100
committerLars-Dominik Braun <lars@6xq.net>2012-02-25 15:46:04 +0100
commit9733ef69a983920fe822c9e57b415c5b6057da87 (patch)
treed026b3f5ca3823d1f4bbaefb3d2a723ad7b99619
downloadmaildirlearn-9733ef69a983920fe822c9e57b415c5b6057da87.tar.gz
maildirlearn-9733ef69a983920fe822c9e57b415c5b6057da87.tar.bz2
maildirlearn-9733ef69a983920fe822c9e57b415c5b6057da87.zip
Initial import
-rw-r--r--.gitignore2
-rw-r--r--Makefile8
-rw-r--r--maildirlearn.c534
-rwxr-xr-xspamclassify.py128
4 files changed, 672 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f5613ab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+maildirlearn
+*.sw?
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..53c7571
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+CFLAGS=-Wall -O3 -march=native
+
+maildirlearn: maildirlearn.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $<
+
+clean:
+ $(RM) -f maildirlearn
+
diff --git a/maildirlearn.c b/maildirlearn.c
new file mode 100644
index 0000000..91b66c8
--- /dev/null
+++ b/maildirlearn.c
@@ -0,0 +1,534 @@
+/*
+Copyright (c) 2012
+ Lars-Dominik Braun <lars@6xq.net>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* strdup */
+#define _BSD_SOURCE
+
+#include <sys/types.h>
+#include <regex.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <stdbool.h>
+#include <sys/inotify.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stddef.h>
+
+/* spam status (tri-state + unknown) */
+typedef enum {UNKNOWN, SPAM, HAM, UNSURE} status_t;
+
+/* linked list */
+struct wdpath {
+ struct wdpath *next;
+ /* inotify watch fd */
+ int wd;
+ /* relative path */
+ char *path;
+};
+typedef struct wdpath wdpath_t;
+
+/* recursive dir reading, linked list */
+struct notifyDirread {
+ struct notifyDirread *next;
+ wdpath_t *wdp;
+ DIR *dir;
+};
+typedef struct notifyDirread notifyDirread_t;
+
+typedef struct {
+ /* inotify fd */
+ int fd;
+
+ /* wd hash table */
+ wdpath_t **tbl;
+ /* hash table size */
+ size_t len;
+
+ /* “root” dir we’re watching */
+ char *basedir;
+
+ char buf[sizeof (struct inotify_event)+1024];
+ /* can’t use the same buf for real and simulated events */
+ char direvbuf[sizeof (struct inotify_event)+1024];
+ size_t filled, read;
+
+ /* linked list and last element of linked list (O(1) append) */
+ notifyDirread_t *dir, *lastdir;
+} notify_t;
+
+/* watch descriptor to hash
+ */
+static unsigned int notifyHash (const notify_t *n, const int wd) {
+ assert (n != NULL);
+
+ return wd % n->len;
+}
+
+/* delete watch descriptor from hashtable
+ */
+static bool notifyTblDel (notify_t *n, const int wd) {
+ unsigned int h;
+ wdpath_t *cur, *prev;
+
+ assert (n != NULL);
+
+ h = notifyHash (n, wd);
+ cur = n->tbl[h];
+ prev = cur;
+
+ while (cur != NULL) {
+ if (cur->wd == wd) {
+ if (prev == cur) {
+ /* remove first entry in list */
+ n->tbl[h] = cur->next;
+ } else {
+ prev->next = cur->next;
+ }
+ printf ("[-] %i, %s\n", wd, cur->path);
+ free (cur->path);
+ free (cur);
+ return true;
+ }
+ prev = cur;
+ cur = cur->next;
+ }
+
+ return false;
+}
+
+/* add watch descriptor and path to hash table
+ */
+static wdpath_t *notifyTblAdd (notify_t *n, const int wd, const char *relpath) {
+ unsigned int h;
+ wdpath_t *cur;
+
+ assert (n != NULL);
+ assert (relpath != NULL);
+
+ h = notifyHash (n, wd);
+ cur = n->tbl[h];
+
+ if (cur != NULL) {
+ while (cur->next != NULL) {
+ if (cur->wd == wd) {
+ /* already have this one */
+ return cur;
+ }
+ cur = cur->next;
+ }
+ cur->next = malloc (sizeof (*cur->next));
+ assert (cur->next != NULL);
+ cur = cur->next;
+ } else {
+ cur = malloc (sizeof (*cur));
+ assert (cur != NULL);
+ n->tbl[h] = cur;
+ }
+
+ cur->wd = wd;
+ cur->path = strdup (relpath);
+ assert (cur->path != NULL);
+ cur->next = NULL;
+
+ printf ("[+] %i, %s\n", wd, relpath);
+
+ return cur;
+}
+
+/* create watch for relpath (relative to basedir)
+ */
+static bool notifyAdd (notify_t *n, const char *relpath) {
+ char path[1024];
+ int wd;
+ wdpath_t *wdp;
+
+ assert (n != NULL);
+ assert (n->basedir != NULL);
+ assert (relpath != NULL);
+
+ if (snprintf (path, sizeof (path), "%s%s", n->basedir, relpath) >= sizeof (path)) {
+ /* truncated */
+ return false;
+ }
+
+ if ((wd = inotify_add_watch (n->fd, path, IN_CREATE | IN_MOVED_TO)) == -1) {
+ perror ("inotify_add_watch");
+ return false;
+ }
+
+ wdp = notifyTblAdd (n, wd, relpath);
+ assert (wdp != NULL);
+
+ /* set up recursion, append to list */
+ notifyDirread_t *cur;
+ if (n->lastdir == NULL) {
+ n->dir = malloc (sizeof (*n->dir));
+ cur = n->dir;
+ } else {
+ n->lastdir->next = malloc (sizeof (*n->lastdir->next));
+ cur = n->lastdir->next;
+ }
+ cur->wdp = wdp;
+ cur->dir = NULL;
+ cur->next = NULL;
+ n->lastdir = cur;
+
+ return true;
+}
+
+/* retrieve path from hash table
+ */
+static const wdpath_t *notifyTblGet (notify_t *n, const int wd) {
+ unsigned int h;
+ wdpath_t *cur;
+
+ assert (n != NULL);
+
+ h = notifyHash (n, wd);
+ cur = n->tbl[h];
+
+ while (cur != NULL) {
+ if (cur->wd == wd) {
+ return cur;
+ }
+ cur = cur->next;
+ }
+
+ return NULL;
+}
+
+/* go to next dir in recursive dir list
+ */
+static void notifyDirNext (notify_t *n) {
+ notifyDirread_t *next;
+
+ closedir (n->dir->dir);
+
+ next = n->dir->next;
+ free (n->dir);
+ n->dir = next;
+ if (n->dir == NULL) {
+ /* this was the last dir in the list */
+ n->lastdir = NULL;
+ }
+}
+
+/* read next event and retrieve event/watch descriptor path structs
+ */
+static bool notifyRead (notify_t *n, const struct inotify_event **retEvent,
+ const wdpath_t **retWdp) {
+ struct inotify_event *event;
+ const wdpath_t *wdp;
+
+ assert (n != NULL);
+ assert (retEvent != NULL);
+ assert (retWdp != NULL);
+
+ while (n->dir != NULL) {
+ if (n->dir->dir != NULL) {
+ /* continue reading open dir */
+ struct dirent *dent;
+
+ while (true) {
+ dent = readdir (n->dir->dir);
+ if (dent != NULL) {
+ struct stat sb;
+ char fullpath[1024];
+
+ if (strcmp (dent->d_name, "..") == 0 || strcmp (dent->d_name, ".") == 0) {
+ continue;
+ }
+
+ assert (n->dir->wdp != NULL);
+ assert (n->dir->wdp->path != NULL);
+ if (snprintf (fullpath, sizeof (fullpath), "%s%s%s", n->basedir,
+ n->dir->wdp->path, dent->d_name) >= sizeof (fullpath)) {
+ /* overflow */
+ assert (0);
+ continue;
+ }
+
+ if (stat (fullpath, &sb) == -1) {
+ perror ("stat");
+ continue;
+ }
+ if (!S_ISDIR (sb.st_mode)) {
+ continue;
+ }
+
+ /* simulate create dir event */
+ event = (struct inotify_event *) n->direvbuf;
+ event->wd = n->dir->wdp->wd;
+ event->mask = IN_CREATE | IN_ISDIR;
+ strncpy (event->name, dent->d_name, sizeof (n->direvbuf)-sizeof (*event)-1);
+ event->len = strlen (dent->d_name)+1;
+
+ *retEvent = event;
+ *retWdp = n->dir->wdp;
+ return true;
+ } else {
+ notifyDirNext (n);
+ break;
+ }
+ /* never reached */
+ assert (0);
+ }
+ } else {
+ char fullpath[1024];
+
+ assert (n->dir != NULL && n->dir->wdp != NULL && n->dir->wdp->path != NULL);
+
+ snprintf (fullpath, sizeof (fullpath), "%s%s", n->basedir,
+ n->dir->wdp->path);
+ n->dir->dir = opendir (fullpath);
+ if (n->dir->dir == NULL) {
+ notifyDirNext (n);
+ }
+ }
+ }
+
+ if (n->read >= n->filled) {
+ ssize_t ret;
+
+ if ((ret = read (n->fd, n->buf, sizeof (n->buf))) == -1) {
+ perror ("read");
+ return false;
+ }
+
+ n->read = 0;
+ n->filled = ret;
+ }
+
+ event = (struct inotify_event *) (n->buf + n->read);
+ n->read += sizeof (*event)+event->len;
+
+ wdp = notifyTblGet (n, event->wd);
+ if (wdp == NULL) {
+ printf ("no wdp\n");
+ return false;
+ }
+
+ *retEvent = event;
+ *retWdp = wdp;
+
+ return true;
+}
+
+/* initialize notify struct: obtain inotify fd, add watch for basedir
+ */
+static bool notifyInit (notify_t *n, const size_t len, const char *basedir) {
+ assert (n != NULL);
+ assert (basedir != NULL);
+
+ memset (n, 0, sizeof (*n));
+
+ n->len = len;
+ n->tbl = calloc (n->len, sizeof (*n->tbl));
+ n->basedir = strdup (basedir);
+
+ if ((n->fd = inotify_init ()) == -1) {
+ perror ("inotify_init");
+ return false;
+ }
+
+ return notifyAdd (n, "");
+}
+
+/* get current bogofilter status, get desired status from path (via regex),
+ * set new bogofilter status
+ */
+static bool runBogofilter (const char *bogopath, const regex_t spamdirre,
+ const char *path) {
+ status_t curStatus = UNKNOWN, newStatus = UNKNOWN;
+ pid_t pid;
+
+ /* get current status */
+ pid = fork ();
+ if (pid == -1) {
+ perror ("fork");
+ return false;
+ } else if (pid == 0) {
+ /* child */
+ if (execl (bogopath, bogopath, "-I", path, (char *) NULL) == -1) {
+ perror ("execl");
+ return false;
+ }
+ /* never reached */
+ assert (0);
+ } else {
+ int status;
+
+ if (waitpid (pid, &status, 0) == -1) {
+ perror ("waitpid");
+ return false;
+ } else {
+ /* translate bogofilter exit status to internal status */
+ switch (WEXITSTATUS (status)) {
+ case 2:
+ printf ("curStatus=unsure\n");
+ curStatus = UNSURE;
+ break;
+
+ case 1:
+ printf ("curStatus=ham\n");
+ curStatus = HAM;
+ break;
+
+ case 0:
+ printf ("curStatus=spam\n");
+ curStatus = SPAM;
+ break;
+
+ default:
+ /* invalid status */
+ return false;
+ break;
+ }
+ }
+ } /* end if fork() */
+
+ /* user decided this is spam? */
+ if (regexec (&spamdirre, path, 0, NULL, 0) == 0) {
+ /* match */
+ printf ("new status: spam\n");
+ newStatus = SPAM;
+ } else {
+ printf ("new status: ham\n");
+ newStatus = HAM;
+ } /* end if regex */
+
+ /* set new status */
+ if (curStatus != newStatus) {
+ const char *bogoopts = NULL;
+
+ if (curStatus == UNSURE) {
+ if (newStatus == HAM) {
+ bogoopts = "-n";
+ } else if (newStatus == SPAM) {
+ bogoopts = "-s";
+ }
+ } else if (curStatus == SPAM && newStatus == HAM) {
+ bogoopts = "-Sn";
+ } else if (curStatus == HAM && newStatus == SPAM) {
+ bogoopts = "-Ns";
+ } else {
+ assert (0);
+ }
+
+ pid = fork ();
+ if (pid == -1) {
+ perror ("fork");
+ } else if (pid == 0) {
+ /* child */
+ if (execl (bogopath, bogopath, bogoopts, "-I", path,
+ (char *) NULL) == -1) {
+ perror ("execl");
+ return false;
+ }
+ /* never reached */
+ assert (0);
+ } else {
+ int status;
+
+ if (waitpid (pid, &status, 0) == -1) {
+ perror ("waitpid2");
+ return false;
+ } else {
+ printf ("bogofilter returned %i\n", WEXITSTATUS (status));
+ }
+ }
+ } /* end if curStatus != newStatus */
+
+ return true;
+}
+
+int main (int argc, char **argv) {
+ regex_t spamdirre, excludere;
+ const char bogopath[] = "bogofilter";
+ /* with '/' postfix */
+ const char watchdir[] = "mail/";
+ int running = 1;
+ notify_t n;
+
+ /* setup */
+ notifyInit (&n, 128, watchdir);
+
+ if (regcomp (&spamdirre, "mail/\\.Junk/", REG_EXTENDED) != 0) {
+ printf ("invalid spamdir re\n");
+ }
+
+ if (regcomp (&excludere, "mail/(\\.Unsure/|[^/]+/(tmp|.*:2,[A-S]*T[U-Z]*$)|.*dovecot)", REG_EXTENDED) != 0) {
+ printf ("invalid exclude re\n");
+ }
+
+ while (running) {
+ const struct inotify_event *event;
+ const wdpath_t *wdp;
+
+ if (notifyRead (&n, &event, &wdp)) {
+ char fullpath[1024], *relpath;
+
+ assert (event != NULL);
+ assert (wdp != NULL);
+
+ /* a word of warning: don’t use event->name if event->len is 0! */
+ if (snprintf (fullpath, sizeof (fullpath), "%s%s%s", n.basedir,
+ wdp->path, (event->len == 0) ? "" : event->name) >= sizeof (fullpath)) {
+ /* overflow */
+ assert (0);
+ continue;
+ }
+ relpath = fullpath + strlen (n.basedir);
+
+ /* is path excluded? */
+ if (regexec (&excludere, fullpath, 0, NULL, 0) == 0) {
+ continue;
+ }
+
+ //printf ("[!] full: %s\n", fullpath);
+
+ if (event->mask & IN_IGNORED || event->mask & IN_DELETE_SELF) {
+ /* watch was removed */
+ notifyTblDel (&n, event->wd);
+ } else if (event->mask & IN_ISDIR) {
+ /* FIXME: insane strncat */
+ strncat (fullpath, "/", sizeof (fullpath)-strlen (fullpath)-1);
+ notifyAdd (&n, relpath);
+ } else {
+ runBogofilter (bogopath, spamdirre, fullpath);
+ }
+ } else {
+ printf ("notifyRead failed\n");
+ }
+ }
+
+ regfree (&spamdirre);
+ regfree (&excludere);
+}
diff --git a/spamclassify.py b/spamclassify.py
new file mode 100755
index 0000000..7c40c27
--- /dev/null
+++ b/spamclassify.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2012
+# Lars-Dominik Braun <lars@6xq.net>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import os, re, sys, subprocess, pyinotify
+
+class Classifier (object):
+ # files in junk are spam
+ SPAMDIR = re.compile (r'mail/\.Junk/(new|cur)')
+ # exclude: Unsure messages, temporary directory, trashed messages (see
+ # http://cr.yp.to/proto/maildir.html), dovecot files
+ EXCLUDE = re.compile (r'mail/(\.Unsure/(new|cur)|[^/]+/(tmp|.*:2,[A-S]*T[U-Z]*$)|.*dovecot)')
+
+ SPAM = 1
+ HAM = 2
+ UNSURE = 3
+
+ def statusFromFile (self, path):
+ """
+ filter mail through bogofilter and return spam status
+ """
+
+ ret = subprocess.call (['bogofilter', '-I', path])
+ if ret == 2:
+ return self.UNSURE
+ elif ret == 1:
+ return self.HAM
+ elif ret == 0:
+ return self.SPAM
+ elif ret == 3:
+ raise Exception ('bogofilter returned error')
+
+ def statusFromPath (self, path):
+ """
+ get spam status from file path
+ """
+
+ if self.SPAMDIR.search (path):
+ return self.SPAM
+ else:
+ return self.HAM
+
+ def isExcluded (self, path):
+ """
+ is path excluded?
+ """
+
+ return self.EXCLUDE.search (path)
+
+ def setStatus (self, curStatus, newStatus, path):
+ """
+ tell bogofilter to set new spam status for message
+ """
+ if curStatus != newStatus:
+ bogoopts = None
+ if curStatus == self.UNSURE:
+ if newStatus == self.HAM:
+ bogoopts = '-n'
+ elif newStatus == self.SPAM:
+ bogoopts = '-s'
+ elif curStatus == self.SPAM and newStatus == self.HAM:
+ bogoopts = '-Sn'
+ elif curStatus == self.HAM and newStatus == self.SPAM:
+ bogoopts = '-Ns'
+ print 'bogoopts %s' % bogoopts
+ ret = subprocess.call (['bogofilter', bogoopts, '-I', path])
+ print 'bogofilter returned %i' % ret
+ return True
+ else:
+ return False
+
+
+ def updateStatus (self, path):
+ """
+ does all the magic
+ """
+ if self.isExcluded (path):
+ return False
+ curStatus = self.statusFromFile (path)
+ newStatus = self.statusFromPath (path)
+ return self.setStatus (curStatus, newStatus, path)
+
+class EventHandler(pyinotify.ProcessEvent):
+ def __init__ (self, wm, mask, classifier):
+ pyinotify.ProcessEvent.__init__ (self)
+
+ self.wm = wm
+ self.mask = mask
+ self.classifier = classifier
+
+ def process_IN_MOVED_TO (self, event):
+ self.classifier.updateStatus (event.pathname)
+
+ def process_IN_CREATE (self, event):
+ # make sure new directories are watched as well
+ if event.mask & pyinotify.IN_ISDIR:
+ wdd = self.wm.add_watch(event.pathname, self.mask, rec=True)
+ else:
+ self.classifier.updateStatus (event.pathname)
+
+if __name__ == '__main__':
+ c = Classifier ()
+ wm = pyinotify.WatchManager()
+ mask = pyinotify.IN_MOVED_TO | pyinotify.IN_CREATE
+ handler = EventHandler(wm, mask, c)
+ notifier = pyinotify.Notifier(wm, handler)
+ wdd = wm.add_watch(os.path.expanduser ('mail/'), mask, rec=True)
+ notifier.loop()
+