Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ruleテーブル追加、Dockerfile追加 #6

Merged
merged 6 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
logs/
er/
parquet/
.vscode/
.mypy_cache/
__pycache__/
.github/
.coderabbit.yaml
.gitignore
*.zip
22 changes: 22 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Runtime image for the tenhou mahjong log ETL tools (python 3.12).
FROM python:3.12-slim-bookworm

# key=value form: the legacy `ENV key value` syntax is deprecated.
ENV APP_PATH=/opt/apps
ENV DEBIAN_FRONTEND=noninteractive
ENV HOME=/home/app
# build.sh installs poetry under $HOME/.local/bin; expose it on PATH.
ENV PATH=$HOME/.local/bin:$PATH

USER root

COPY . ${APP_PATH}
WORKDIR ${APP_PATH}

# Create the unprivileged runtime user up front, then build as root.
RUN groupadd -r app && useradd -r -g app app && mkdir -p ${HOME}
# build.sh carries its own `set -e`; a `set -e &&` prefix here would only
# affect the wrapping shell, not the child bash, so it is omitted.
RUN bash build.sh
# Single chown layer: separate chown RUNs each duplicate the whole tree
# into a new image layer.
RUN chown -R app:app ${APP_PATH} ${HOME}

USER app
WORKDIR ${APP_PATH}

ENTRYPOINT [ "/bin/bash", "-c" ]
CMD [ "python" ]
26 changes: 26 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Container build script: installs OS packages, the Google Cloud SDK and
# the project's Python dependencies (via poetry, no virtualenv).
#
# pipefail added: with plain `set -e` the `curl | apt-key` and
# `curl | python` pipelines would succeed even when curl fails.
set -eo pipefail

# Base packages for repo setup, TLS, and native-extension builds.
apt-get update && apt-get upgrade -y && \
apt-get install -y --no-install-recommends curl \
libffi-dev libssl-dev \
python3-crcmod \
apt-transport-https \
lsb-release \
openssh-client \
gnupg

# Install the Google Cloud SDK from Google's apt repository.
# NOTE(review): apt-key is deprecated on Debian 12 (bookworm); consider
# `curl ... | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg`
# instead of `apt-key --keyring ... add -`.
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt-get update -y && \
apt-get install google-cloud-sdk -y --no-install-recommends

# Trim apt caches to keep the image small.
rm -rf /var/lib/apt/lists/*
apt-get clean

# Install poetry, then the runtime ("main") dependency group system-wide.
curl -sSL https://install.python-poetry.org | python -

poetry config virtualenvs.create false
poetry install --only main --no-root
29 changes: 26 additions & 3 deletions er/mahjong.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
erDiagram
experiments ||--|{ games : ""
game_players ||--|{ games : ""
game_players ||--|{ players : ""
games ||--|| rules : ""
games ||--|{ kyokus : ""
games ||--|{ game_scores : ""
paiyamas ||--|{ kyokus : ""
Expand All @@ -15,8 +15,31 @@ erDiagram
string id PK
}

players {
string name PK
rules {
string game_id PK
bool enable_kuitan "喰い断"
bool enable_atozuke "後付け"
bool enable_pao "責任払い"
bool enable_tobi "とび"
bool enable_wareme "われめ"
bool enable_kuinaoshi "喰い直し"
bool enable_minus_riichi "点数マイナスのリーチ"
bool enable_ryanhan_shibari "5本場以上の二翻縛り"
bool enable_keiten "形式テンパイ"
bool enable_glass_pai "ガラス牌"
uint aka_type "0-3萬子、4-7筒子8-11索子の赤牌の数"
int shanyu_score "-1は無条件で西入、0は西入なし、正数の場合はその点数未満で西入"
int nannyu_score "-1は無条件で南入、0は南入なし、正数の場合はその点数未満で南入"
uint uradora_type "裏ドラ 0なし、1裏ドラあり、カン裏なし、2カン裏あり"
uint furiten_riichi_type "bit0流局時、bit1ツモで0はチョンボ、1はチョンボじゃない"
uint oyanagare_type "0東場、1南場、2西場、3北場ノーテン親流れの場合1が立つ"
uint double_ron_type "0:頭ハネ 1:ダブロンあり 2:トリプルあり"
uint kan_in_riichi_type "リーチ後のカン 0:禁止 1:待ち不変可 2:構成不変で可"
uint initial_score "開始点"
bool is_demo "AI対局か"
bool is_soku "速卓か"
bool is_sanma "三麻"
int level "天鳳レベル"
}

kyokus {
Expand Down
48 changes: 20 additions & 28 deletions extractor.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,25 @@
from urllib.parse import urlparse
from tqdm import tqdm
from typing import Optional
from datetime import datetime, timezone
import xml.etree.ElementTree as ET
import requests
import gzip
import re
import os

from scrape import parse_document, save_to_parquet

import argparse

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"}

DOWNLOAD_PREFIX = "https://tenhou.net/sc/raw/dat/"


def extract_latest_logs(log_dir: str, output_dir: Optional[str]):
r = requests.get("https://tenhou.net/sc/raw/list.cgi", headers=headers)
def extract_logs(is_old: bool, log_dir: str):
old_query = "?old" if is_old else ""
r = requests.get(f"https://tenhou.net/sc/raw/list.cgi{old_query}", headers=headers)
atag = re.compile(r"<a\s+href=[\"'](?P<href>.*?)[\"']")

r.raise_for_status()

text = r.text.replace("list([\r\n", "").replace(");", "")
files = text.split(",\r\n")
prev_dt = None
seqno = 0

for archive_item in files:
if "html" in archive_item:
Expand All @@ -39,15 +35,11 @@ def extract_latest_logs(log_dir: str, output_dir: Optional[str]):

if dtstr is None:
raise Exception("date cannot found")
dt = datetime.strptime(dtstr[0], r"%Y%m%d").replace(tzinfo=timezone.utc)

if output_dir is not None and prev_dt is not None and prev_dt != dt:
save_to_parquet(output_dir, prev_dt)
seqno = 0
prev_dt = dt

page = requests.get(url, headers=headers)

page.raise_for_status()

data = gzip.decompress(page.content).decode("utf-8")

lines = data.split("\r\n")
Expand All @@ -65,17 +57,17 @@ def extract_latest_logs(log_dir: str, output_dir: Optional[str]):
# ファイルの存在チェック
filepath = os.path.join(log_dir, dtstr[0], f"{log_id}.xml")

if os.path.exists(filepath):
# 存在する場合は読み出す
if output_dir is not None:
tree = ET.parse(filepath)
seqno = parse_document(tree.getroot(), log_id, dt, seqno)
else:
log_file = requests.get(url, headers=headers)
log_file = requests.get(url, headers=headers)

with open(filepath, "w") as f:
f.write(log_file.text)


parser = argparse.ArgumentParser(description="tenho mahjong log etl tool")

parser.add_argument("--old", help="log file download from old or latest archive", action="store_true")
parser.add_argument("--output-dir", "-O", help="transform log output directory", type=str)

with open(filepath, "w") as f:
f.write(log_file.text)
args = parser.parse_args()

if output_dir is not None:
doc = ET.fromstring(log_file.text)
seqno = parse_document(doc, log_id, dt, seqno)
extract_logs(args.old, args.output_dir)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ readme = "README.md"
packages = [{include = "mahjong_etl"}]

[tool.poetry.dependencies]
python = "^3.9"
python = "^3.12"
requests = "^2.31.0"
pyarrow = "^14.0.1"
pandas = "^2.1.4"
Expand Down
6 changes: 6 additions & 0 deletions run_extractor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Download tenhou logs (old archive first, then latest) into ./logs and
# sync the result to the GCS bucket named by $BUCKET_NAME.
#
# -e: abort instead of rsyncing partial data when an extractor run fails.
# -u: fail fast if BUCKET_NAME is unset rather than syncing to "gs:///".
set -euo pipefail

python extractor.py -O logs --old
python extractor.py -O logs

# NOTE(review): `-x` is the gsutil-style exclude flag; confirm
# `gcloud storage rsync` accepts it (its documented flag is --exclude).
gcloud storage rsync logs "gs://${BUCKET_NAME}/logs" --recursive -x '.*\.DS_Store'
93 changes: 70 additions & 23 deletions scrape.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Dict, Tuple, Any, Set, Optional
from typing import List, Dict, Tuple, Any, Optional
from datetime import datetime
import xml.etree.ElementTree as ET
import pyarrow as pa
Expand All @@ -10,21 +10,43 @@
import urllib.parse

# parquetに書き出すテーブルおよびレコード
Player = pa.schema([pa.field("name", pa.string())])

Game = pa.schema(
[
pa.field("id", pa.string()),
pa.field("tonpu", pa.bool_()),
pa.field("ariari", pa.bool_()),
pa.field("has_aka", pa.bool_()),
pa.field("demo", pa.bool_()),
pa.field("soku", pa.bool_()),
pa.field("level", pa.int32()),
pa.field("started_at", pa.date64()),
]
)

# Parquet schema for the per-game rule record (one row per game, keyed by
# game_id). Field semantics are documented in er/mahjong.md.
Rule = pa.schema(
    [
        pa.field("game_id", pa.string()),
        pa.field("enable_kuitan", pa.bool_()),
        pa.field("enable_atozuke", pa.bool_()),
        pa.field("enable_pao", pa.bool_()),
        pa.field("enable_tobi", pa.bool_()),
        pa.field("enable_wareme", pa.bool_()),
        pa.field("enable_kunaoshi", pa.bool_()),
        pa.field("enable_kuriage", pa.bool_()),
        pa.field("enable_agariyame", pa.bool_()),
        pa.field("enable_minus_riichi", pa.bool_()),
        pa.field("enable_ryanhan_shibari", pa.bool_()),
        pa.field("enable_keiten", pa.bool_()),
        # Bug fix: the rule dicts built in parse_document include an
        # "enable_glass_pai" key (and the ER diagram documents it), but the
        # schema lacked the field, so the column was silently dropped when
        # writing parquet.
        pa.field("enable_glass_pai", pa.bool_()),
        pa.field("aka_type", pa.uint32()),
        pa.field("shanyu_score", pa.int32()),
        pa.field("nannyu_score", pa.int32()),
        # NOTE(review): er/mahjong.md declares uradora_type as uint; kept
        # int32 here to avoid changing the on-disk type — confirm intent.
        pa.field("uradora_type", pa.int32()),
        pa.field("furiten_riichi_type", pa.uint32()),
        pa.field("oyanagare_type", pa.uint32()),
        pa.field("double_ron_type", pa.uint32()),
        pa.field("kan_in_riichi_type", pa.uint32()),
        pa.field("initial_score", pa.uint32()),
        pa.field("is_demo", pa.bool_()),
        pa.field("is_soku", pa.bool_()),
        pa.field("is_sanma", pa.bool_()),
        pa.field("level", pa.int32()),
    ]
)

GamePlayer = pa.schema([pa.field("game_id", pa.string()), pa.field("player_name", pa.string()), pa.field("player_index", pa.int32())])

GameScore = pa.schema(
Expand Down Expand Up @@ -102,8 +124,8 @@

Nagare = pa.schema([pa.field("kyoku_id", pa.int64()), pa.field("name", pa.string()), pa.field("score_diff", pa.list_(pa.int32(), 4))])

players: Set[str] = set()
games: List[Dict[str, Any]] = []
rules: List[Dict[str, Any]] = []
game_players: List[Dict[str, Any]] = []
game_scores: List[Dict[str, Any]] = []
kyokus: List[Dict[str, Any]] = []
Expand Down Expand Up @@ -348,10 +370,9 @@ def parse_document(root: ET.Element, game_id: str, dt: datetime, seqno: int) ->
if child.tag == "GO":
tp = int(child.attrib["type"])

if (tp & 0x02) != 0:
has_aka = True
else:
has_aka = False
has_aka = (tp & 0x02) != 0
is_sanma = (tp & 0x10) != 0

games.append(
{
"id": game_id,
Expand All @@ -365,6 +386,36 @@ def parse_document(root: ET.Element, game_id: str, dt: datetime, seqno: int) ->
"started_at": dt64,
}
)
rules.append(
{
"game_id": game_id,
"enable_kuitan": (tp & 0x04) == 0,
"enable_atozuke": (tp & 0x04) == 0,
"enable_pao": True,
"enable_tobi": True,
"enable_wareme": False,
"enable_kunaoshi": True,
"enable_kuriage": False,
"enable_agariyame": True,
"enable_minus_riichi": True,
"enable_ryanhan_shibari": False,
"enable_keiten": True,
"enable_glass_pai": False,
"aka_type": 73 if has_aka else 0,
"shanyu_score": 30000,
"nannyu_score": 0 if (tp & 0x08) == 0 else -1,
"uradora_type": 2,
"furiten_riichi_type": 3,
"oyanagare_type": 0xF,
"double_ron_type": 1,
"initial_score": 35000 if is_sanma else 25000,
"kan_in_riichi_type": (tp & 0x10000) >> 16,
"is_demo": (tp & 0x01) == 0,
"is_soku": (tp & 0x40) != 0,
"is_sanma": is_sanma,
"level": (tp & 0x20) >> 4 | (tp & 0x80) >> 7,
}
)
elif child.tag == "UN":
n0 = child.attrib.get("n0")
n1 = child.attrib.get("n1")
Expand Down Expand Up @@ -552,9 +603,6 @@ def parse_document(root: ET.Element, game_id: str, dt: datetime, seqno: int) ->
reach = False
actions.append({"kyoku_id": kyoku_id, "player_index": who, "seq": action_count, "type": "sutehai", "pais": p})
action_count += 1
for name in player_name.values():
players.add(name)

for idx, key in enumerate(sorted(player_name.items())):
game_players.append({"game_id": game_id, "player_name": key[1], "player_index": idx})

Expand All @@ -563,23 +611,23 @@ def parse_document(root: ET.Element, game_id: str, dt: datetime, seqno: int) ->

def save_to_parquet(basedir: str, dt: datetime):
datestr = dt.strftime(r"dt=%Y-%m-%d")
os.makedirs(os.path.join(basedir, "players", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "games", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "rules", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "game_players", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "game_scores", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "kyokus", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "haipais", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "actions", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "agaris", datestr), exist_ok=True)
os.makedirs(os.path.join(basedir, "nagares", datestr), exist_ok=True)
wp = pq.ParquetWriter(os.path.join(basedir, "players", datestr, "players.parquet"), schema=Player)
batch = pa.RecordBatch.from_pandas(pd.DataFrame([{"name": name} for name in players]), schema=Player)
wp.write_batch(batch)
wp.close()
wg = pq.ParquetWriter(os.path.join(basedir, "games", datestr, "games.parquet"), schema=Game)
batch = pa.RecordBatch.from_pandas(pd.DataFrame(games), schema=Game)
wg.write_batch(batch)
wg.close()
wgr = pq.ParquetWriter(os.path.join(basedir, "rules", datestr, "rules.parquet"), schema=Rule)
batch = pa.RecordBatch.from_pandas(pd.DataFrame(rules), schema=Rule)
wgr.write_batch(batch)
wgr.close()
wgp = pq.ParquetWriter(os.path.join(basedir, "game_players", datestr, "game_players.parquet"), schema=GamePlayer)
batch = pa.RecordBatch.from_pandas(pd.DataFrame(game_players), schema=GamePlayer)
wgp.write_batch(batch)
Expand Down Expand Up @@ -610,7 +658,6 @@ def save_to_parquet(basedir: str, dt: datetime):
batch = pa.RecordBatch.from_pandas(pd.DataFrame(nagares), schema=Nagare)
wn.write_batch(batch)
wn.close()
players.clear()
games.clear()
game_players.clear()
game_scores.clear()
Expand Down
Loading