Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 3e1f0cf

Browse files
committed Mar 20, 2025
Several fixes for the SSO-redirect workflow:
- If a seed page redirects to another page and then back (such as for SSO), ensure the original seed is used for link extraction.
- Don't allow direct fetch if there is no MIME type at all.
- Don't add --lang when using a profile; display a warning instead, as a language override may invalidate profile settings.
- Add a temporary extra delay if the seed page redirects, to ensure any SSO-related redirects finish.
1 parent 56e19e1 commit 3e1f0cf

File tree

3 files changed

+43
-7
lines changed

3 files changed

+43
-7
lines changed
 

‎src/crawler.ts

+41-6
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ import {
4949
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
5050
ExitCodes,
5151
InterruptReason,
52+
SEED_REDIRECT_ADD_DELAY,
5253
} from "./util/constants.js";
5354

5455
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@@ -574,7 +575,14 @@ export class Crawler {
574575
extraChromeArgs() {
575576
const args = [];
576577
if (this.params.lang) {
577-
args.push(`--accept-lang=${this.params.lang}`);
578+
if (this.params.profile) {
579+
logger.warn(
580+
"Ignoring --lang option with profile, using language configured in the profile",
581+
{ lang: this.params.lang },
582+
);
583+
} else {
584+
args.push(`--accept-lang=${this.params.lang}`);
585+
}
578586
}
579587
return args;
580588
}
@@ -2071,6 +2079,8 @@ self.__bx_behaviors.selectMainBehavior();
20712079

20722080
const respUrl = resp.url().split("#")[0];
20732081
const isChromeError = page.url().startsWith("chrome-error://");
2082+
let thisPageDelay = 0;
2083+
let originalSeedId = null;
20742084

20752085
if (
20762086
depth === 0 &&
@@ -2079,6 +2089,7 @@ self.__bx_behaviors.selectMainBehavior();
20792089
respUrl + "/" !== url &&
20802090
!downloadResponse
20812091
) {
2092+
originalSeedId = data.seedId;
20822093
data.seedId = await this.crawlState.addExtraSeed(
20832094
this.seeds,
20842095
this.numOriginalSeeds,
@@ -2090,6 +2101,7 @@ self.__bx_behaviors.selectMainBehavior();
20902101
newUrl: respUrl,
20912102
seedId: data.seedId,
20922103
});
2104+
thisPageDelay = SEED_REDIRECT_ADD_DELAY;
20932105
}
20942106

20952107
const status = resp.status();
@@ -2176,7 +2188,7 @@ self.__bx_behaviors.selectMainBehavior();
21762188

21772189
await this.netIdle(page, logDetails);
21782190

2179-
await this.awaitPageLoad(page.mainFrame(), logDetails);
2191+
await this.awaitPageLoad(page.mainFrame(), thisPageDelay, logDetails);
21802192

21812193
// skip extraction if at max depth
21822194
if (seed.isAtMaxDepth(depth, extraHops)) {
@@ -2190,6 +2202,27 @@ self.__bx_behaviors.selectMainBehavior();
21902202
"links",
21912203
);
21922204

2205+
const pageUrl = page.url().split("#")[0];
2206+
2207+
if (depth === 0 && respUrl !== urlNoHash) {
2208+
if (pageUrl === urlNoHash && originalSeedId !== null) {
2209+
logger.info("Seed page redirected back to original seed", { pageUrl });
2210+
data.seedId = originalSeedId;
2211+
} else {
2212+
data.seedId = await this.crawlState.addExtraSeed(
2213+
this.seeds,
2214+
this.numOriginalSeeds,
2215+
data.seedId,
2216+
pageUrl,
2217+
);
2218+
logger.info("Seed page redirected, adding redirected seed", {
2219+
origUrl: respUrl,
2220+
newUrl: pageUrl,
2221+
seedId: data.seedId,
2222+
});
2223+
}
2224+
}
2225+
21932226
await this.extractLinks(page, data, this.params.selectLinks, logDetails);
21942227
}
21952228

@@ -2211,7 +2244,7 @@ self.__bx_behaviors.selectMainBehavior();
22112244
}
22122245
}
22132246

2214-
async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
2247+
async awaitPageLoad(frame: Frame, tempDelay: number, logDetails: LogDetails) {
22152248
logger.debug(
22162249
"Waiting for custom page load via behavior",
22172250
logDetails,
@@ -2230,11 +2263,13 @@ self.__bx_behaviors.selectMainBehavior();
22302263
logger.warn("Waiting for custom page load failed", e, "behavior");
22312264
}
22322265

2233-
if (this.params.postLoadDelay) {
2266+
const delay = tempDelay + this.params.postLoadDelay;
2267+
2268+
if (delay) {
22342269
logger.info("Awaiting post load delay", {
2235-
seconds: this.params.postLoadDelay,
2270+
seconds: delay,
22362271
});
2237-
await sleep(this.params.postLoadDelay);
2272+
await sleep(delay);
22382273
}
22392274
}
22402275

‎src/replaycrawler.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -446,7 +446,7 @@ export class ReplayCrawler extends Crawler {
446446
// optionally reload (todo: reevaluate if this is needed)
447447
// await page.reload();
448448

449-
await this.awaitPageLoad(replayFrame, logDetails);
449+
await this.awaitPageLoad(replayFrame, 0, logDetails);
450450

451451
data.isHTMLPage = true;
452452

‎src/util/constants.ts

+1
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ export const DEFAULT_MAX_RETRIES = 2;
3232
export const FETCH_HEADERS_TIMEOUT_SECS = 30;
3333
export const PAGE_OP_TIMEOUT_SECS = 5;
3434
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
35+
export const SEED_REDIRECT_ADD_DELAY = 20;
3536

3637
export type ExtractSelector = {
3738
selector: string;

0 commit comments

Comments
 (0)
Please sign in to comment.