From b30a44cfe9456deafc83e008f8501c391cd1e258 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 29 Apr 2018 10:44:33 +0200 Subject: Move page archiving logic to SinglePageController In preparation for recursive crawls. --- crocoite/task.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 crocoite/task.py (limited to 'crocoite/task.py') diff --git a/crocoite/task.py b/crocoite/task.py new file mode 100644 index 0000000..39900a5 --- /dev/null +++ b/crocoite/task.py @@ -0,0 +1,71 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Celery distributed tasks +""" + +import os + +from urllib.parse import urlsplit +from datetime import datetime + +from celery import Celery +from celery.utils.log import get_task_logger + +from .controller import SinglePageController, ControllerSettings +from . import behavior + +app = Celery ('crocoite.distributed') +app.config_from_object('celeryconfig') +logger = get_task_logger('crocoite.distributed.archive') + +@app.task(bind=True, track_started=True) +def archive (self, url, settings, enabledBehaviorNames): + """ + Archive a single URL + + Supports these config keys (celeryconfig): + + warc_filename = '{domain}-{date}-{id}.warc.gz' + temp_dir = '/tmp/' + finished_dir = '/tmp/finished' + """ + + parsedUrl = urlsplit (url) + outFile = app.conf.warc_filename.format ( + id=self.request.id, + domain=parsedUrl.hostname.replace ('/', '-'), + date=datetime.utcnow ().isoformat (), + ) + outPath = os.path.join (app.conf.temp_dir, outFile) + fd = open (outPath, 'wb') + + enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) + settings = ControllerSettings (**settings) + controller = SinglePageController (url, fd, behavior=enabledBehavior, settings=settings) + ret = controller.run () + + os.makedirs (app.conf.finished_dir, exist_ok=True) + outPath = os.path.join (app.conf.finished_dir, outFile) + os.rename (fd.name, outPath) + + return ret + -- cgit v1.2.3