@@ -49,6 +49,7 @@ import {
49
49
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS ,
50
50
ExitCodes ,
51
51
InterruptReason ,
52
+ SEED_REDIRECT_ADD_DELAY ,
52
53
} from "./util/constants.js" ;
53
54
54
55
import { AdBlockRules , BlockRuleDecl , BlockRules } from "./util/blockrules.js" ;
@@ -574,7 +575,14 @@ export class Crawler {
574
575
/**
 * Build the extra command-line arguments passed to the browser.
 *
 * When `params.lang` is set it is normally forwarded as `--accept-lang`.
 * If `params.profile` is also set, the `--lang` option is ignored and a
 * warning is logged instead, so the language configured in the profile is
 * the one that takes effect.
 *
 * @returns {string[]} the list of extra browser arguments (possibly empty)
 */
extraChromeArgs() {
  const args = [];

  if (this.params.lang) {
    if (this.params.profile) {
      // Do not pass --accept-lang alongside a profile; surface the
      // ignored option so the user knows why it had no effect.
      logger.warn(
        "Ignoring --lang option with profile, using language configured in the profile",
        { lang: this.params.lang },
      );
    } else {
      args.push(`--accept-lang=${this.params.lang}`);
    }
  }

  return args;
}
@@ -2071,6 +2079,8 @@ self.__bx_behaviors.selectMainBehavior();
2071
2079
2072
2080
const respUrl = resp . url ( ) . split ( "#" ) [ 0 ] ;
2073
2081
const isChromeError = page . url ( ) . startsWith ( "chrome-error://" ) ;
2082
+ let thisPageDelay = 0 ;
2083
+ let originalSeedId = null ;
2074
2084
2075
2085
if (
2076
2086
depth === 0 &&
@@ -2079,6 +2089,7 @@ self.__bx_behaviors.selectMainBehavior();
2079
2089
respUrl + "/" !== url &&
2080
2090
! downloadResponse
2081
2091
) {
2092
+ originalSeedId = data . seedId ;
2082
2093
data . seedId = await this . crawlState . addExtraSeed (
2083
2094
this . seeds ,
2084
2095
this . numOriginalSeeds ,
@@ -2090,6 +2101,7 @@ self.__bx_behaviors.selectMainBehavior();
2090
2101
newUrl : respUrl ,
2091
2102
seedId : data . seedId ,
2092
2103
} ) ;
2104
+ thisPageDelay = SEED_REDIRECT_ADD_DELAY ;
2093
2105
}
2094
2106
2095
2107
const status = resp . status ( ) ;
@@ -2176,7 +2188,7 @@ self.__bx_behaviors.selectMainBehavior();
2176
2188
2177
2189
await this . netIdle ( page , logDetails ) ;
2178
2190
2179
- await this . awaitPageLoad ( page . mainFrame ( ) , logDetails ) ;
2191
+ await this . awaitPageLoad ( page . mainFrame ( ) , thisPageDelay , logDetails ) ;
2180
2192
2181
2193
// skip extraction if at max depth
2182
2194
if ( seed . isAtMaxDepth ( depth , extraHops ) ) {
@@ -2190,6 +2202,27 @@ self.__bx_behaviors.selectMainBehavior();
2190
2202
"links" ,
2191
2203
) ;
2192
2204
2205
+ const pageUrl = page . url ( ) . split ( "#" ) [ 0 ] ;
2206
+
2207
+ if ( depth === 0 && respUrl !== urlNoHash ) {
2208
+ if ( pageUrl === urlNoHash && originalSeedId !== null ) {
2209
+ logger . info ( "Seed page redirected back to original seed" , { pageUrl } ) ;
2210
+ data . seedId = originalSeedId ;
2211
+ } else {
2212
+ data . seedId = await this . crawlState . addExtraSeed (
2213
+ this . seeds ,
2214
+ this . numOriginalSeeds ,
2215
+ data . seedId ,
2216
+ pageUrl ,
2217
+ ) ;
2218
+ logger . info ( "Seed page redirected, adding redirected seed" , {
2219
+ origUrl : respUrl ,
2220
+ newUrl : pageUrl ,
2221
+ seedId : data . seedId ,
2222
+ } ) ;
2223
+ }
2224
+ }
2225
+
2193
2226
await this . extractLinks ( page , data , this . params . selectLinks , logDetails ) ;
2194
2227
}
2195
2228
@@ -2211,7 +2244,7 @@ self.__bx_behaviors.selectMainBehavior();
2211
2244
}
2212
2245
}
2213
2246
2214
- async awaitPageLoad ( frame : Frame , logDetails : LogDetails ) {
2247
+ async awaitPageLoad ( frame : Frame , tempDelay : number , logDetails : LogDetails ) {
2215
2248
logger . debug (
2216
2249
"Waiting for custom page load via behavior" ,
2217
2250
logDetails ,
@@ -2230,11 +2263,13 @@ self.__bx_behaviors.selectMainBehavior();
2230
2263
logger . warn ( "Waiting for custom page load failed" , e , "behavior" ) ;
2231
2264
}
2232
2265
2233
- if ( this . params . postLoadDelay ) {
2266
+ const delay = tempDelay + this . params . postLoadDelay ;
2267
+
2268
+ if ( delay ) {
2234
2269
logger . info ( "Awaiting post load delay" , {
2235
- seconds : this . params . postLoadDelay ,
2270
+ seconds : delay ,
2236
2271
} ) ;
2237
- await sleep ( this . params . postLoadDelay ) ;
2272
+ await sleep ( delay ) ;
2238
2273
}
2239
2274
}
2240
2275
0 commit comments