Skip to content

Commit b8e85c0

Browse files
committed
Rewrite README HTML to fix broken internal links, closes #58
Refs simonw/datasette.io#46
1 parent 1d95844 commit b8e85c0

File tree

2 files changed

+54
-23
lines changed

2 files changed

+54
-23
lines changed

github_to_sqlite/utils.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import base64
22
import requests
3+
import re
34
import time
45
import yaml
56

@@ -766,11 +767,31 @@ def fetch_readme(token, full_name, html=False):
766767
if response.status_code != 200:
767768
return None
768769
if html:
769-
return response.text
770+
return rewrite_readme_html(response.text)
770771
else:
771772
return base64.b64decode(response.json()["content"]).decode("utf-8")
772773

773774

775+
# Matches an in-page anchor link attribute such as ` href="#filtering-tables"`.
# Group 1 captures the fragment without the leading "#".
_href_re = re.compile(r'\shref="#([^"]+)"')
# Matches an id attribute such as ` id="user-content-filtering-tables"`.
# Group 1 captures the id value.
_id_re = re.compile(r'\sid="([^"]+)"')


def rewrite_readme_html(html):
    """Point in-page README links at their ``user-content-`` prefixed anchors.

    Example: ``href="#filtering-tables"`` becomes
    ``href="#user-content-filtering-tables"``, but only when the document
    actually contains ``id="user-content-filtering-tables"``.

    Links are left untouched when they already start with ``user-content-``
    or when no matching prefixed id exists in the document.

    Args:
        html: The README rendered as an HTML string.

    Returns:
        The HTML string with the qualifying ``href`` attributes rewritten.
    """
    # A set gives O(1) membership checks for every link we inspect.
    ids = set(_id_re.findall(html))

    def _rewrite(match):
        fragment = match.group(1)
        if fragment.startswith("user-content-"):
            return match.group(0)
        if "user-content-{}".format(fragment) not in ids:
            return match.group(0)
        # Re-emit whichever whitespace character preceded href (space, tab,
        # newline...) so that links not preceded by a plain space are
        # rewritten too, and the surrounding markup is otherwise unchanged.
        return '{}href="#user-content-{}"'.format(match.group(0)[0], fragment)

    # Single pass over the document instead of one str.replace per link.
    return _href_re.sub(_rewrite, html)
793+
794+
774795
def fetch_workflows(token, full_name):
775796
headers = make_headers(token)
776797
url = "https://api.github.com/repos/{}/contents/.github/workflows".format(full_name)

tests/test_repos.py

+32-22
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@
88
from github_to_sqlite import cli
99
import pytest
1010

11+
README_HTML = """
12+
<li><a href="#filtering-tables">Filtering tables</a></li>
13+
...
14+
<h3><a id="user-content-filtering-tables" class="anchor" aria-hidden="true" href="#filtering-tables">#</a>Filtering tables</h3>
15+
"""
16+
EXPECTED_README_HTML = """
17+
<li><a href="#user-content-filtering-tables">Filtering tables</a></li>
18+
...
19+
<h3><a id="user-content-filtering-tables" class="anchor" aria-hidden="true" href="#user-content-filtering-tables">#</a>Filtering tables</h3>
20+
"""
21+
1122

1223
@pytest.fixture
1324
def mocked(requests_mock):
@@ -21,27 +32,14 @@ def mocked(requests_mock):
2132
)
2233
requests_mock.get(
2334
"https://api.github.com/repos/dogsheep/github-to-sqlite/readme",
24-
text="<h1>This is the README</h1>",
35+
text=README_HTML,
2536
additional_matcher=lambda request: request.headers.get("accept")
2637
== "application/vnd.github.VERSION.html",
2738
)
2839

2940

3041
def test_repos(mocked, tmpdir):
31-
runner = CliRunner()
32-
db_path = str(tmpdir / "test.db")
33-
result = runner.invoke(
34-
cli.cli,
35-
[
36-
"repos",
37-
db_path,
38-
"-r",
39-
"dogsheep/github-to-sqlite",
40-
"--readme",
41-
"--readme-html",
42-
],
43-
)
44-
assert 0 == result.exit_code
42+
db_path = _run_repos(tmpdir)
4543
db = sqlite_utils.Database(db_path)
4644
assert db.table_names() == [
4745
"users",
@@ -67,11 +65,10 @@ def test_repos(mocked, tmpdir):
6765
repo = next(iter(db["repos"].rows))
6866
assert repo["full_name"] == "dogsheep/github-to-sqlite"
6967
assert repo["readme"] == "# This is the README"
70-
assert repo["readme_html"] == "<h1>This is the README</h1>"
68+
assert repo["readme_html"] is not None
7169

7270

7371
def test_repos_readme_not_available(requests_mock, tmpdir):
74-
runner = CliRunner()
7572
requests_mock.get(
7673
"https://api.github.com/repos/dogsheep/github-to-sqlite",
7774
json=json.load(open(pathlib.Path(__file__).parent / "repo.json")),
@@ -80,6 +77,23 @@ def test_repos_readme_not_available(requests_mock, tmpdir):
8077
"https://api.github.com/repos/dogsheep/github-to-sqlite/readme",
8178
status_code=400,
8279
)
80+
db_path = _run_repos(tmpdir)
81+
db = sqlite_utils.Database(db_path)
82+
row = list(db["repos"].rows)[0]
83+
assert row["name"] == "github-to-sqlite"
84+
assert row["readme"] is None
85+
assert row["readme_html"] is None
86+
87+
88+
def test_readme_internal_links_are_rewritten(mocked, tmpdir):
    """Stored readme_html must have its internal links rewritten.

    Regression test for
    https://github.com/dogsheep/github-to-sqlite/issues/58
    """
    database = sqlite_utils.Database(_run_repos(tmpdir))
    first_repo = list(database["repos"].rows)[0]
    assert first_repo["readme_html"] == EXPECTED_README_HTML
93+
94+
95+
def _run_repos(tmpdir):
96+
runner = CliRunner()
8397
db_path = str(tmpdir / "test.db")
8498
result = runner.invoke(
8599
cli.cli,
@@ -93,8 +107,4 @@ def test_repos_readme_not_available(requests_mock, tmpdir):
93107
],
94108
)
95109
assert 0 == result.exit_code
96-
db = sqlite_utils.Database(db_path)
97-
row = list(db["repos"].rows)[0]
98-
assert row["name"] == "github-to-sqlite"
99-
assert row["readme"] is None
100-
assert row["readme_html"] is None
110+
return db_path

0 commit comments

Comments
 (0)