From efec8c0984da26c50c232336cf4219db79c0022a Mon Sep 17 00:00:00 2001
From: Ryo Fujimura <36524811+ryofujimura@users.noreply.github.com>
Date: Fri, 21 Feb 2025 08:56:22 +0000
Subject: [PATCH] Fix "global flags not at the start of the expression" error

Since Python 3.11, re.compile() raises re.error when a global inline
flag such as (?i) appears anywhere other than the start of the pattern.
ExtLinkBracketedRegex and EXT_IMAGE_REGEX both embedded (?i) in the
middle of their patterns.

Drop the inline flag and pass re.I to re.compile() instead. A global
inline flag always applied to the whole pattern regardless of where it
was written, so matching behaviour is unchanged, and the capture groups
of both patterns keep their numbering.

Also make the first literal of ExtLinkBracketedRegex a raw string so
that '\[' is no longer an invalid string escape.

---
Reviewer notes (ignored by git am):
- The "index" line is omitted on purpose: the post-image blob id has to
  be regenerated with git format-patch after this revision is applied.
- Context-line indentation is assumed to be four spaces; confirm it
  against wikiextractor/extract.py before applying.

 wikiextractor/extract.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -379,14 +379,14 @@ def dropSpans(spans, text):
 # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
 # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
 EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
 ExtLinkBracketedRegex = re.compile(
-    '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
-    re.S | re.U)
+    r'\[((' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
+    re.I | re.S | re.U)
 EXT_IMAGE_REGEX = re.compile(
     r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
-    /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""",
-    re.X | re.S | re.U)
+    /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.(gif|png|jpg|jpeg)$""",
+    re.I | re.X | re.S | re.U)
 
 
 def replaceExternalLinks(text):
     s = ''