warc prefix support:
- supports setting the 'WARC_PREFIX' env var in Browsertrix Crawler (requires crawler 1.0.0-beta.4 or higher)
- prefix is set to <org slug>-<slug of [crawl name | first seed host]>
- uses the crawl name, if provided, otherwise the hostname of the first seed; either is converted to a slug (lowercase alphanumeric, separated by dashes)
- fixes #412
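
A minimal sketch of the derivation, assuming slug_from_name (imported from btrixcloud/utils, not shown in this diff) lowercases its input and joins alphanumeric runs with dashes; names and values here are illustrative:

import re
from urllib.parse import urlsplit

def slug_from_name(name: str) -> str:
    # assumed behavior: lowercase, keep alphanumeric runs, join with dashes
    return "-".join(re.findall(r"[a-z0-9]+", name.lower()))

def warc_prefix(org_slug: str, crawl_name: str, first_seed_url: str) -> str:
    # prefer the crawl name; otherwise fall back to the first seed's hostname
    name = crawl_name or urlsplit(first_seed_url).netloc
    return f"{org_slug}-{slug_from_name(name)}"

print(warc_prefix("my-org", "Weekly News Crawl", "https://example.com/"))  # my-org-weekly-news-crawl
print(warc_prefix("my-org", "", "https://news.example.com/start"))         # my-org-news-example-com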
ikreymer committed Feb 21, 2024
1 parent c1cffe9 commit db4b787
Showing 6 changed files with 63 additions and 22 deletions.
23 changes: 21 additions & 2 deletions backend/btrixcloud/crawlconfigs.py
@@ -33,7 +33,7 @@
CrawlerChannel,
CrawlerChannels,
)
from .utils import dt_now
from .utils import dt_now, slug_from_name

if TYPE_CHECKING:
from .orgs import OrgOps
@@ -232,6 +232,7 @@ async def add_crawl_config(
run_now=run_now,
out_filename=out_filename,
profile_filename=profile_filename or "",
warc_prefix=self.get_warc_prefix(org, crawlconfig),
)

if crawl_id and run_now:
@@ -298,6 +299,7 @@ async def readd_configmap(
run_now=False,
out_filename=self.default_filename_template,
profile_filename=profile_filename or "",
warc_prefix=self.get_warc_prefix(org, crawlconfig),
)

async def update_crawl_config(
@@ -841,7 +843,10 @@ async def run_now(self, cid: UUID, org: Organization, user: User):

try:
crawl_id = await self.crawl_manager.create_crawl_job(
crawlconfig, org.storage, userid=str(user.id)
crawlconfig,
org.storage,
userid=str(user.id),
warc_prefix=self.get_warc_prefix(org, crawlconfig),
)
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
return crawl_id
@@ -897,6 +902,20 @@ def get_channel_crawler_image(
"""Get crawler image name by id"""
return self.crawler_images_map.get(crawler_channel or "")

def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
"""Generate WARC prefix slug from org slug, name or url
if no name is provided, hostname is used from url, otherwise
url is ignored"""
name = crawlconfig.name
if not name:
if crawlconfig.config.seeds and len(crawlconfig.config.seeds):
url = crawlconfig.config.seeds[0].url
parts = urllib.parse.urlsplit(url)
name = parts.netloc

name = slug_from_name(name or "")
return org.slug + "-" + name
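# Example outputs (illustrative, assuming slug_from_name lowercases and joins
# alphanumeric runs with dashes): for org.slug == "acme", a config named
# "Site Crawl #1" gives "acme-site-crawl-1"; with no name and a first seed of
# https://example.org/start, the result is "acme-example-org".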


# ============================================================================
# pylint: disable=too-many-locals
8 changes: 7 additions & 1 deletion backend/btrixcloud/crawlmanager.py
@@ -119,6 +119,7 @@ async def add_crawl_config(
run_now: bool,
out_filename: str,
profile_filename: str,
warc_prefix: str,
) -> Optional[str]:
"""add new crawl, store crawl config in configmap"""

@@ -139,7 +140,10 @@

if run_now:
crawl_id = await self.create_crawl_job(
crawlconfig, storage, str(crawlconfig.modifiedBy)
crawlconfig,
storage,
str(crawlconfig.modifiedBy),
warc_prefix,
)

await self._update_scheduled_job(crawlconfig)
@@ -151,6 +155,7 @@ async def create_crawl_job(
crawlconfig: CrawlConfig,
storage: StorageRef,
userid: str,
warc_prefix: str,
) -> str:
"""create new crawl job from config"""
cid = str(crawlconfig.id)
@@ -169,6 +174,7 @@
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
warc_prefix=warc_prefix,
)

async def update_crawl_config(
2 changes: 2 additions & 0 deletions backend/btrixcloud/k8sapi.py
@@ -86,6 +86,7 @@ def new_crawl_job_yaml(
max_crawl_size=0,
manual=True,
crawl_id=None,
warc_prefix="",
):
"""load job template from yaml"""
if not crawl_id:
@@ -104,6 +105,7 @@
"storage_name": str(storage),
"manual": "1" if manual else "0",
"crawler_channel": crawler_channel,
"warc_prefix": warc_prefix,
}

data = self.templates.env.get_template("crawl_job.yaml").render(params)
48 changes: 29 additions & 19 deletions backend/btrixcloud/operator.py
@@ -538,6 +538,8 @@ async def sync_crawls(self, data: MCSyncData):
params["storage_filename"] = configmap["STORE_FILENAME"]
params["restart_time"] = spec.get("restartTime")

params["warc_prefix"] = spec.get("warcPrefix")

params["redis_url"] = redis_url

if spec.get("restartTime") != status.restartTime:
@@ -1651,26 +1653,10 @@ async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):

org = await self.org_ops.get_org_by_id(UUID(oid))

crawl_id, crawljob = self.new_crawl_job_yaml(
cid,
userid=userid,
oid=oid,
storage=org.storage,
crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
scale=int(configmap.get("INITIAL_SCALE", 1)),
crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
manual=False,
crawl_id=crawl_id,
)

attachments = list(yaml.safe_load_all(crawljob))

if crawl_id in crawljobs:
attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
warc_prefix = None

if not actual_state:
# pylint: disable=duplicate-code
# cronjob doesn't exist yet
crawlconfig = await self.crawl_config_ops.get_crawl_config(
UUID(cid), UUID(oid)
)
@@ -1686,11 +1672,35 @@
print(f"error: missing user for id {userid}")
return {"attachments": []}

warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)

await self.crawl_config_ops.add_new_crawl(
crawl_id, crawlconfig, user, manual=False
crawl_id,
crawlconfig,
user,
manual=False,
)
print("Scheduled Crawl Created: " + crawl_id)

crawl_id, crawljob = self.new_crawl_job_yaml(
cid,
userid=userid,
oid=oid,
storage=org.storage,
crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
scale=int(configmap.get("INITIAL_SCALE", 1)),
crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
manual=False,
crawl_id=crawl_id,
warc_prefix=warc_prefix,
)

attachments = list(yaml.safe_load_all(crawljob))

if crawl_id in crawljobs:
attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]

return {
"attachments": attachments,
}
1 change: 1 addition & 0 deletions chart/app-templates/crawl_job.yaml
@@ -24,6 +24,7 @@ spec:
manual: {{ manual }}
crawlerChannel: "{{ crawler_channel }}"
ttlSecondsAfterFinished: 30
warcPrefix: "{{ warc_prefix }}"

storageName: "{{ storage_name }}"

3 changes: 3 additions & 0 deletions chart/app-templates/crawler.yaml
@@ -149,6 +149,9 @@ spec:
- name: STORE_USER
value: "{{ userid }}"

- name: WARC_PREFIX
value: "{{ warc_prefix }}"

{% if crawler_socks_proxy_host %}
- name: SOCKS_HOST
value: "{{ crawler_socks_proxy_host }}"
