Skip to content

Commit

Permalink
web-slang: Trim wikitext templates before processing
Browse files Browse the repository at this point in the history
wikitext templates messed the output up.
  • Loading branch information
Rongronggg9 authored and felixonmars committed May 9, 2024
1 parent f4caa92 commit a76ad0a
Showing 1 changed file with 29 additions and 0 deletions.
29 changes: 29 additions & 0 deletions zhwiki-web-slang.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,36 @@ def fetch():
return wikitext


def trim_templates(wikitext):
    """Return *wikitext* with every ``{{...}}`` template removed.

    The text is scanned left to right for the markers ``{{`` and ``}}``.
    Only text found at nesting depth zero is kept; a template, together
    with any templates nested inside it, is dropped entirely.

    Args:
        wikitext: The raw wikitext string to clean.

    Returns:
        The text that lies outside all templates, concatenated in order.

    Raises:
        ValueError: If a ``}}`` appears without a matching ``{{``, or a
            ``{{`` is never closed. The message carries the text that was
            still unprocessed when the imbalance was detected.
    """
    kept_parts = []  # pieces of text found outside any template
    depth = 0        # current template nesting level
    pos = 0          # index of the first character not yet consumed

    while True:
        open_at = wikitext.find("{{", pos)
        close_at = wikitext.find("}}", pos)

        if open_at != -1 and (close_at == -1 or open_at < close_at):
            # The next marker is an opening "{{".  Text preceding it is
            # kept only when we are not already inside a template.
            if depth == 0:
                kept_parts.append(wikitext[pos:open_at])
            depth += 1
            pos = open_at + 2
        elif close_at != -1:
            # The next marker is a closing "}}".
            pos = close_at + 2
            depth -= 1
            if depth < 0:
                # Raise explicitly instead of using ``assert`` so that the
                # check survives ``python -O`` and has the intended type.
                raise ValueError(
                    "Unbalanced template in wikitext:\n" + wikitext[pos:]
                )
        else:
            # No markers remain; everything left is plain text.
            if depth != 0:
                raise ValueError(
                    "Unbalanced template in wikitext:\n" + wikitext[pos:]
                )
            kept_parts.append(wikitext[pos:])
            break

    return "".join(kept_parts)


def process(wikitext):
wikitext = trim_templates(wikitext)
words = collections.OrderedDict()

def add_word(word):
Expand Down

0 comments on commit a76ad0a

Please sign in to comment.