scpwiki · emmiegit · May 6, 2024 · May 3, 2024 · May 3, 2024 · May 5, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -8,7 +8,7 @@ keywords = ["wikidot", "wikijump", "ftml", "parsing", "html"]
 categories = ["parser-implementations"]
 exclude = [".gitignore", ".editorconfig"]
 
-version = "1.23.0"
+version = "1.24.0"
 authors = ["Emmie Smith <[email protected]>"]
 edition = "2021"
 

diff --git a/misc/ast-test-template.json b/misc/ast-test-template.json
@@ -10,6 +10,10 @@
                 }
             }
         ],
+        "html-blocks": [
+        ],
+        "code-blocks": [
+        ],
         "table-of-contents": [
         ],
         "footnotes": [

@@ -20,6 +20,16 @@
 
 //! This module provides functions to parse strings into [`IncludeRef`]s
 
+mod parser {
+    // Since pest generates some code that clippy doesn't like
+    #![allow(clippy::empty_docs)]
+
+    #[derive(Parser, Debug)]
+    #[grammar = "includes/grammar.pest"]
+    pub struct IncludeParser;
+}
+
+use self::parser::*;
 use super::IncludeRef;
 use crate::data::{PageRef, PageRefParseError};
 use crate::settings::WikitextSettings;
@@ -28,10 +38,6 @@ use pest::Parser;
 use std::borrow::Cow;
 use std::collections::HashMap;
 
-#[derive(Parser, Debug)]
-#[grammar = "includes/grammar.pest"]
-struct IncludeParser;
-
 /// Parses a single include block in the text.
 ///
 /// # Arguments

@@ -169,6 +169,9 @@ pub enum ParseErrorKind {
     /// Bibliography contains an element other than a definition list.
     BibliographyContainsNonDefinitionList,
 
+    /// Code block has a name which is not unique.
+    CodeNonUniqueName,
+
     /// There is no rule for the block name specified.
     NoSuchBlock,
 

@@ -61,8 +61,8 @@ use crate::next_index::{NextIndex, TableOfContentsIndex};
 use crate::settings::WikitextSettings;
 use crate::tokenizer::Tokenization;
 use crate::tree::{
-    AttributeMap, BibliographyList, Element, LinkLabel, LinkLocation, LinkType, ListItem,
-    ListType, SyntaxTree,
+    AttributeMap, BibliographyList, CodeBlock, Element, LinkLabel, LinkLocation,
+    LinkType, ListItem, ListType, SyntaxTree,
 };
 use std::borrow::Cow;
 
@@ -86,6 +86,8 @@ where
     // Run parsing, get raw results
     let UnstructuredParseResult {
         result,
+        html_blocks,
+        code_blocks,
         table_of_contents_depths,
         footnotes,
         has_footnote_block,
@@ -132,6 +134,7 @@ where
             SyntaxTree::from_element_result(
                 elements,
                 errors,
+                (html_blocks, code_blocks),
                 table_of_contents,
                 footnotes,
                 bibliographies,
@@ -155,6 +158,7 @@ where
             SyntaxTree::from_element_result(
                 elements,
                 errors,
+                (html_blocks, code_blocks),
                 table_of_contents,
                 footnotes,
                 bibliographies,
@@ -180,13 +184,17 @@ where
     let result = gather_paragraphs(&mut parser, RULE_PAGE, NO_CLOSE_CONDITION);
 
     // Build and return
+    let html_blocks = parser.remove_html_blocks();
+    let code_blocks = parser.remove_code_blocks();
     let table_of_contents_depths = parser.remove_table_of_contents();
     let footnotes = parser.remove_footnotes();
     let has_footnote_block = parser.has_footnote_block();
     let bibliographies = parser.remove_bibliographies();
 
     UnstructuredParseResult {
         result,
+        html_blocks,
+        code_blocks,
         table_of_contents_depths,
         footnotes,
         has_footnote_block,
@@ -249,6 +257,12 @@ pub struct UnstructuredParseResult<'r, 't> {
     /// The returned result from parsing.
     pub result: ParseResult<'r, 't, Vec<Element<'t>>>,
 
+    /// The list of HTML blocks to emit from this page.
+    pub html_blocks: Vec<Cow<'t, str>>,
+
+    /// The list of code blocks to emit from this page.
+    pub code_blocks: Vec<CodeBlock<'t>>,
+
     /// The "depths" list for table of content entries.
     ///
     /// Each value is a zero-indexed depth of how

@@ -25,7 +25,10 @@ use super::RULE_PAGE;
 use crate::data::PageInfo;
 use crate::render::text::TextRender;
 use crate::tokenizer::Tokenization;
-use crate::tree::{AcceptsPartial, Bibliography, BibliographyList, HeadingLevel};
+use crate::tree::{
+    AcceptsPartial, Bibliography, BibliographyList, CodeBlock, HeadingLevel,
+};
+use std::borrow::Cow;
 use std::cell::RefCell;
 use std::rc::Rc;
 use std::{mem, ptr};
@@ -58,6 +61,12 @@ pub struct Parser<'r, 't> {
     //       here preserved across parser child instances.
     table_of_contents: Rc<RefCell<Vec<(usize, String)>>>,
 
+    // HTML blocks with data to expose
+    html_blocks: Rc<RefCell<Vec<Cow<'t, str>>>>,
+
+    // Code blocks with data to expose
+    code_blocks: Rc<RefCell<Vec<CodeBlock<'t>>>>,
+
     // Footnotes
     //
     // Schema: Vec<List of elements in a footnote>
@@ -102,6 +111,8 @@ impl<'r, 't> Parser<'r, 't> {
             rule: RULE_PAGE,
             depth: 0,
             table_of_contents: make_shared_vec(),
+            html_blocks: make_shared_vec(),
+            code_blocks: make_shared_vec(),
             footnotes: make_shared_vec(),
             bibliographies: Rc::new(RefCell::new(BibliographyList::new())),
             accepts_partial: AcceptsPartial::None,
@@ -221,6 +232,16 @@ impl<'r, 't> Parser<'r, 't> {
         self.table_of_contents.borrow_mut().push((level, name));
     }
 
+    #[cold]
+    pub fn remove_html_blocks(&mut self) -> Vec<Cow<'t, str>> {
+        mem::take(&mut self.html_blocks.borrow_mut())
+    }
+
+    #[cold]
+    pub fn remove_code_blocks(&mut self) -> Vec<CodeBlock<'t>> {
+        mem::take(&mut self.code_blocks.borrow_mut())
+    }
+
     #[cold]
     pub fn remove_table_of_contents(&mut self) -> Vec<(usize, String)> {
         mem::take(&mut self.table_of_contents.borrow_mut())
@@ -236,6 +257,34 @@ impl<'r, 't> Parser<'r, 't> {
         mem::take(&mut self.footnotes.borrow_mut())
     }
 
+    // Blocks
+    pub fn push_html_block(&mut self, new_block: Cow<'t, str>) {
+        self.html_blocks.borrow_mut().push(new_block);
+    }
+
+    /// Records a code block, rejecting it if its name is already taken.
+    pub fn push_code_block(
+        &mut self,
+        new_block: CodeBlock<'t>,
+    ) -> Result<(), NonUniqueNameError> {
+        // Unnamed blocks are always accepted, only names must be unique
+        if let Some(new_name) = new_block.name.as_ref() {
+            let is_taken = self
+                .code_blocks
+                .borrow()
+                .iter()
+                .filter_map(|block| block.name.as_ref())
+                .any(|name| name == new_name);
+            if is_taken {
+                return Err(NonUniqueNameError);
+            }
+        }
+
+        // The shared borrow has ended by now, so borrowing mutably is fine
+        self.code_blocks.borrow_mut().push(new_block);
+        Ok(())
+    }
+
     // Bibliography
     pub fn push_bibliography(&mut self, bibliography: Bibliography<'t>) -> usize {
         let mut guard = self.bibliographies.borrow_mut();
@@ -252,10 +301,16 @@ impl<'r, 't> Parser<'r, 't> {
     // Special for [[include]], appending a SyntaxTree
     pub fn append_shared_items(
         &mut self,
+        html_blocks: &mut Vec<Cow<'t, str>>,
+        code_blocks: &mut Vec<CodeBlock<'t>>,
         table_of_contents: &mut Vec<(usize, String)>,
         footnotes: &mut Vec<Vec<Element<'t>>>,
         bibliographies: &mut BibliographyList<'t>,
     ) {
+        self.html_blocks.borrow_mut().append(html_blocks);
+
+        self.code_blocks.borrow_mut().append(code_blocks);
+
         self.table_of_contents
             .borrow_mut()
             .append(table_of_contents);
@@ -516,6 +571,9 @@ impl<'r, 't> Parser<'r, 't> {
     }
 }
 
+#[derive(Debug)]
+pub struct NonUniqueNameError;
+
 #[inline]
 fn make_shared_vec<T>() -> Rc<RefCell<Vec<T>>> {
     Rc::new(RefCell::new(Vec::new()))

@@ -19,6 +19,8 @@
  */
 
 use super::prelude::*;
+use crate::tree::CodeBlock;
+use wikidot_normalize::normalize;
 
 pub const BLOCK_CODE: BlockRule = BlockRule {
     name: "block-code",
@@ -42,13 +44,29 @@ fn parse_fn<'r, 't>(
     assert_block_name(&BLOCK_CODE, name);
 
     let mut arguments = parser.get_head_map(&BLOCK_CODE, in_head)?;
-    let language = arguments.get("type");
+
+    let mut language = arguments.get("type");
+    if let Some(ref mut language) = language {
+        language.to_mut().make_ascii_lowercase();
+    }
+
+    let mut name = arguments.get("name");
+    if let Some(ref mut name) = name {
+        normalize(name.to_mut());
+    }
 
     let code = parser.get_body_text(&BLOCK_CODE)?;
     let element = Element::Code {
         contents: cow!(code),
         language,
     };
+    let added_result = parser.push_code_block(CodeBlock {
+        contents: cow!(code),
+        name,
+    });
+    if added_result.is_err() {
+        return Err(parser.make_err(ParseErrorKind::CodeNonUniqueName));
+    }
 
     ok!(element)
 }
@@ -46,6 +46,7 @@ fn parse_fn<'r, 't>(
     let element = Element::Html {
         contents: cow!(html),
     };
+    parser.push_html_block(cow!(html));
 
     ok!(element)
 }
@@ -22,6 +22,8 @@ use super::prelude::*;
 use crate::data::PageRef;
 use crate::parsing::UnstructuredParseResult;
 
+// TODO: Maybe scrap this? We want to move to components anyway.
+
 /// Block rule for include (elements).
 ///
 /// This takes the resultant `SyntaxTree` from another page and
@@ -60,6 +62,8 @@ fn parse_fn<'r, 't>(
     // Get page to be included
     let UnstructuredParseResult {
         result,
+        mut html_blocks,
+        mut code_blocks,
         mut table_of_contents_depths,
         mut footnotes,
         has_footnote_block,
@@ -80,6 +84,8 @@ fn parse_fn<'r, 't>(
 
     // Update parser state, build, and return
     parser.append_shared_items(
+        &mut html_blocks,
+        &mut code_blocks,
         &mut table_of_contents_depths,
         &mut footnotes,
         &mut bibliographies,
@@ -112,6 +118,8 @@ fn include_page<'r, 't>(
             vec![],
             false,
         )),
+        html_blocks: vec![],
+        code_blocks: vec![],
         table_of_contents_depths: vec![],
         footnotes: vec![],
         has_footnote_block: false,

@@ -22,8 +22,8 @@
 mod test;
 
 mod lexer {
-    // Since pest makes enums automatically that clippy doesn't like
-    #![allow(clippy::upper_case_acronyms)]
+    // Since pest generates some code that clippy doesn't like
+    #![allow(clippy::upper_case_acronyms, clippy::empty_docs)]
 
     // The actual parser definition, which we will re-export
     #[derive(Parser, Debug)]

@@ -29,6 +29,7 @@ fn html() {
     let result = SyntaxTree::from_element_result(
         vec![],
         vec![],
+        (vec![], vec![]),
         vec![],
         vec![],
         BibliographyList::new(),

@@ -50,6 +50,7 @@ fn null() {
     let result = SyntaxTree::from_element_result(
         vec![],
         vec![],
+        (vec![], vec![]),
         vec![],
         vec![],
         BibliographyList::new(),

@@ -403,6 +403,8 @@ fn arb_tree() -> impl Strategy<Value = SyntaxTree<'static>> {
         .prop_map(|(elements, table_of_contents, footnotes, wikitext_len)| {
             SyntaxTree {
                 elements,
+                html_blocks: Vec::new(),
+                code_blocks: Vec::new(), // these two are derived fields
                 table_of_contents,
                 footnotes,
                 bibliographies: BibliographyList::new(), // not bothering right now

@@ -0,0 +1,48 @@
+/*
+ * code.rs
+ *
+ * ftml - Library to parse Wikidot text
+ * Copyright (C) 2019-2024 Wikijump Team
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+//! Structure to represent a code block.
+
+use super::clone::{option_string_to_owned, string_to_owned};
+use std::borrow::Cow;
+
+/// A code block gathered from a page during parsing.
+///
+/// Both strings are [`Cow`]s, so they may either borrow for `'t` or own their data.
+#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)]
+pub struct CodeBlock<'t> {
+    /// The text contained in the code block.
+    pub contents: Cow<'t, str>,
+
+    /// The name assigned to the code block, if one was given.
+    pub name: Option<Cow<'t, str>>,
+}
+
+impl<'t> CodeBlock<'t> {
+    /// Creates a copy of this block with the `'static` lifetime.
+    ///
+    /// The fields are converted using `string_to_owned` and `option_string_to_owned`.
+    pub fn to_owned(&self) -> CodeBlock<'static> {
+        CodeBlock {
+            contents: string_to_owned(&self.contents),
+            name: option_string_to_owned(&self.name),
+        }
+    }
+}