warc prefix support:
- supports setting the 'WARC_PREFIX' env var in Browsertrix Crawler (requires crawler 1.0.0-beta.4 or higher)
- prefix is set to <org slug>-<slug of [crawl name | first seed host]>
- uses the crawl name, if provided, otherwise the hostname of the first seed; either is converted to a slug (lowercase alphanumeric, separated by dashes)
- fixes #412
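
A minimal sketch of the derivation, assuming slug_from_name (imported from btrixcloud/utils, not shown in this diff) lowercases its input and joins alphanumeric runs with dashes; names and values here are illustrative:

import re
from urllib.parse import urlsplit

def slug_from_name(name: str) -> str:
    # assumed behavior: lowercase, keep alphanumeric runs, join with dashes
    return "-".join(re.findall(r"[a-z0-9]+", name.lower()))

def warc_prefix(org_slug: str, crawl_name: str, first_seed_url: str) -> str:
    # prefer the crawl name; otherwise fall back to the first seed's hostname
    name = crawl_name or urlsplit(first_seed_url).netloc
    return f"{org_slug}-{slug_from_name(name)}"

print(warc_prefix("my-org", "Weekly News Crawl", "https://example.com/"))  # my-org-weekly-news-crawl
print(warc_prefix("my-org", "", "https://news.example.com/start"))         # my-org-news-example-com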
ikreymer committed Feb 21, 2024
1 parent c1cffe9 commit db4b787
Showing 6 changed files with 63 additions and 22 deletions.
23 changes: 21 additions & 2 deletions backend/btrixcloud/crawlconfigs.py
@@ -33,7 +33,7 @@
CrawlerChannel,
CrawlerChannels,
)
from .utils import dt_now
from .utils import dt_now, slug_from_name

if TYPE_CHECKING:
from .orgs import OrgOps
@@ -232,6 +232,7 @@ async def add_crawl_config(
run_now=run_now,
out_filename=out_filename,
profile_filename=profile_filename or "",
warc_prefix=self.get_warc_prefix(org, crawlconfig),
)

if crawl_id and run_now:
@@ -298,6 +299,7 @@ async def readd_configmap(
run_now=False,
out_filename=self.default_filename_template,
profile_filename=profile_filename or "",
warc_prefix=self.get_warc_prefix(org, crawlconfig),
)

async def update_crawl_config(
@@ -841,7 +843,10 @@ async def run_now(self, cid: UUID, org: Organization, user: User):

try:
crawl_id = await self.crawl_manager.create_crawl_job(
crawlconfig, org.storage, userid=str(user.id)
crawlconfig,
org.storage,
userid=str(user.id),
warc_prefix=self.get_warc_prefix(org, crawlconfig),
)
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
return crawl_id
@@ -897,6 +902,20 @@ def get_channel_crawler_image(
"""Get crawler image name by id"""
return self.crawler_images_map.get(crawler_channel or "")

def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
"""Generate WARC prefix slug from org slug, name or url
if no name is provided, hostname is used from url, otherwise
url is ignored"""
name = crawlconfig.name
if not name:
if crawlconfig.config.seeds and len(crawlconfig.config.seeds):
url = crawlconfig.config.seeds[0].url
parts = urllib.parse.urlsplit(url)
name = parts.netloc

name = slug_from_name(name or "")
return org.slug + "-" + name
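# Example outputs (illustrative, assuming slug_from_name lowercases and joins
# alphanumeric runs with dashes): for org.slug == "acme", a config named
# "Site Crawl #1" gives "acme-site-crawl-1"; with no name and a first seed of
# https://example.org/start, the result is "acme-example-org".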


# ============================================================================
# pylint: disable=too-many-locals
8 changes: 7 additions & 1 deletion backend/btrixcloud/crawlmanager.py
@@ -119,6 +119,7 @@ async def add_crawl_config(
run_now: bool,
out_filename: str,
profile_filename: str,
warc_prefix: str,
) -> Optional[str]:
"""add new crawl, store crawl config in configmap"""

@@ -139,7 +140,10 @@

if run_now:
crawl_id = await self.create_crawl_job(
crawlconfig, storage, str(crawlconfig.modifiedBy)
crawlconfig,
storage,
str(crawlconfig.modifiedBy),
warc_prefix,
)

await self._update_scheduled_job(crawlconfig)
@@ -151,6 +155,7 @@ async def create_crawl_job(
crawlconfig: CrawlConfig,
storage: StorageRef,
userid: str,
warc_prefix: str,
) -> str:
"""create new crawl job from config"""
cid = str(crawlconfig.id)
@@ -169,6 +174,7 @@
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
warc_prefix=warc_prefix,
)

async def update_crawl_config(
2 changes: 2 additions & 0 deletions backend/btrixcloud/k8sapi.py
@@ -86,6 +86,7 @@ def new_crawl_job_yaml(
max_crawl_size=0,
manual=True,
crawl_id=None,
warc_prefix="",
):
"""load job template from yaml"""
if not crawl_id:
@@ -104,6 +105,7 @@
"storage_name": str(storage),
"manual": "1" if manual else "0",
"crawler_channel": crawler_channel,
"warc_prefix": warc_prefix,
}

data = self.templates.env.get_template("crawl_job.yaml").render(params)
48 changes: 29 additions & 19 deletions backend/btrixcloud/operator.py
@@ -538,6 +538,8 @@ async def sync_crawls(self, data: MCSyncData):
params["storage_filename"] = configmap["STORE_FILENAME"]
params["restart_time"] = spec.get("restartTime")

params["warc_prefix"] = spec.get("warcPrefix")

params["redis_url"] = redis_url

if spec.get("restartTime") != status.restartTime:
@@ -1651,26 +1653,10 @@ async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):

org = await self.org_ops.get_org_by_id(UUID(oid))

crawl_id, crawljob = self.new_crawl_job_yaml(
cid,
userid=userid,
oid=oid,
storage=org.storage,
crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
scale=int(configmap.get("INITIAL_SCALE", 1)),
crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
manual=False,
crawl_id=crawl_id,
)

attachments = list(yaml.safe_load_all(crawljob))

if crawl_id in crawljobs:
attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
warc_prefix = None

if not actual_state:
# pylint: disable=duplicate-code
# cronjob doesn't exist yet
crawlconfig = await self.crawl_config_ops.get_crawl_config(
UUID(cid), UUID(oid)
)
@@ -1686,11 +1672,35 @@
print(f"error: missing user for id {userid}")
return {"attachments": []}

warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)

await self.crawl_config_ops.add_new_crawl(
crawl_id, crawlconfig, user, manual=False
crawl_id,
crawlconfig,
user,
manual=False,
)
print("Scheduled Crawl Created: " + crawl_id)

crawl_id, crawljob = self.new_crawl_job_yaml(
cid,
userid=userid,
oid=oid,
storage=org.storage,
crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
scale=int(configmap.get("INITIAL_SCALE", 1)),
crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
manual=False,
crawl_id=crawl_id,
warc_prefix=warc_prefix,
)

attachments = list(yaml.safe_load_all(crawljob))

if crawl_id in crawljobs:
attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]

return {
"attachments": attachments,
}
1 change: 1 addition & 0 deletions chart/app-templates/crawl_job.yaml
@@ -24,6 +24,7 @@ spec:
manual: {{ manual }}
crawlerChannel: "{{ crawler_channel }}"
ttlSecondsAfterFinished: 30
warcPrefix: "{{ warc_prefix }}"

storageName: "{{ storage_name }}"

3 changes: 3 additions & 0 deletions chart/app-templates/crawler.yaml
@@ -149,6 +149,9 @@ spec:
- name: STORE_USER
value: "{{ userid }}"

- name: WARC_PREFIX
value: "{{ warc_prefix }}"

{% if crawler_socks_proxy_host %}
- name: SOCKS_HOST
value: "{{ crawler_socks_proxy_host }}"
