--- a/includes/wikiengine/parse_mediawiki.php Sun Mar 28 21:49:26 2010 -0400
+++ b/includes/wikiengine/parse_mediawiki.php Sun Mar 28 23:10:46 2010 -0400
@@ -13,278 +13,278 @@
class Carpenter_Parse_MediaWiki
{
- public $rules = array(
- 'bold' => "/'''(.+?)'''/",
- 'italic' => "/''(.+?)''/",
- 'underline' => '/__(.+?)__/',
- 'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#',
- 'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#',
- 'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
- 'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
- 'hr' => '/^[-]{4,} *$/m',
- 'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
- );
-
- private $blockquote_rand_id;
-
- public function lang(&$text)
- {
- global $lang;
-
- preg_match_all('/<lang (?:code|id)="([a-z0-9_-]+)">([\w\W]+?)<\/lang>/', $text, $langmatch);
- foreach ( $langmatch[0] as $i => $match )
- {
- if ( $langmatch[1][$i] == $lang->lang_code )
- {
- $text = str_replace_once($match, $langmatch[2][$i], $text);
- }
- else
- {
- $text = str_replace_once($match, '', $text);
- }
- }
-
- return array();
- }
-
- public function templates(&$text)
- {
- $template_regex = "/\{\{(.+)((\n|\|[ ]*([A-z0-9]+)[ ]*=[ ]*(.+))*)\}\}/isU";
- $i = 0;
- while ( preg_match($template_regex, $text, $match) )
- {
- $i++;
- if ( $i == 5 )
- break;
- $text = RenderMan::include_templates($text);
- }
-
- return array();
- }
-
- public function heading(&$text)
- {
- if ( !preg_match_all('/^(={1,6}) *(.+?) *\\1 *$/m', $text, $results) )
- return array();
-
- $headings = array();
- foreach ( $results[0] as $i => $match )
- {
- $headings[] = array(
- 'level' => strlen($results[1][$i]),
- 'text' => $results[2][$i]
- );
- }
-
- $text = Carpenter::tokenize($text, $results[0]);
-
- return $headings;
- }
-
- public function multilist(&$text)
- {
- // Match entire lists
- $regex = '/^
- ([:#\*])+ # Initial list delimiter
- [ ]*
- .+?
- (?:
- \r?\n
- (?:\\1|[ ]{2,})
- [ ]*
- .+?)*
- $/mx';
-
- if ( !preg_match_all($regex, $text, $lists) )
- return array();
-
- $types = array(
- '*' => 'unordered',
- '#' => 'ordered',
- ':' => 'indent'
- );
-
- $pieces = array();
- foreach ( $lists[0] as $i => $list )
- {
- $token = $lists[1][$i];
- $piece = array(
- 'type' => $types[$token],
- 'items' => array()
- );
-
- // convert windows newlines to unix
- $list = str_replace("\r\n", "\n", $list);
- $items_pre = explode("\n", $list);
- $items = array();
- // first pass, go through and combine items that are newlined
- foreach ( $items_pre as $item )
- {
- if ( substr($item, 0, 1) == $token )
- {
- $items[] = $item;
- }
- else
- {
- // it's a continuation of the previous LI. Don't need to worry about
- // undefined indices here since the regex should filter out all invalid
- // markup. Just append this line to the previous.
- $items[ count($items) - 1 ] .= "\n" . trim($item);
- }
- }
-
- // second pass, separate items and tokens
- unset($items_pre);
- foreach ( $items as $item )
- {
- // get the depth
- $itemtoken = preg_replace('/^([#:\*]+).*$/s', '$1', $item);
- // get the text
- $itemtext = trim(substr($item, strlen($itemtoken)));
- $piece['items'][] = array(
- // depth starts at 1
- 'depth' => strlen($itemtoken),
- 'text' => $itemtext
- );
- }
- $pieces[] = $piece;
- }
-
- $text = Carpenter::tokenize($text, $lists[0]);
-
- return $pieces;
- }
-
- public function blockquote(&$text)
- {
- $rand_id = hexencode(AESCrypt::randkey(16), '', '');
-
- while ( preg_match_all('/^(?:(>+) *.+(?:\r?\n|$))+/m', $text, $quotes) )
- {
- foreach ( $quotes[0] as $quote )
- {
- $piece = trim(preg_replace('/^> */m', '', $quote));
- $text = str_replace_once($quote, "{blockquote:$rand_id}\n$piece\n{/blockquote:$rand_id}\n", $text);
- }
- }
- //die('<pre>' . htmlspecialchars($text) . '</pre>');
-
- $this->blockquote_rand_id = $rand_id;
- }
-
- public function blockquotepost(&$text)
- {
- return $this->blockquote_rand_id;
- }
-
- public function paragraph(&$text)
- {
- // The trick with paragraphs is to not turn things into them when a block level element already wraps the block of text.
- // First we need a list of block level elements (http://htmlhelp.com/reference/html40/block.html + some Enano extensions)
- $blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';
-
- // Wrap all block level tags
- RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
-
- // Find all opening and closing tags
-
- $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";
-
- // oh. and we're using this tokens thing because for identical matches, the first match will
- // get wrapped X number of times instead of all matches getting wrapped once; replacing each
- // with a unique token id remedies this
-
- $tokens = array();
- $rand_id = sha1(microtime() . mt_rand());
- $tag_stack = array();
-
- if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
- {
- $text = '';
- // go through the text, extract tag names, and push them to a stack.
- foreach ( $text_split as $splitpart )
- {
- if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
- {
- $tagname = $match[2];
- if ( $match[1] == '/' )
- {
- // closing tag
- if ( $tagname != ($top = array_pop($tag_stack)) )
- {
- // invalid - push back
- array_push($tag_stack, $top);
- }
- else
- {
- // valid - if stack's at zero, add a </_paragraph_bypass>
- if ( count($tag_stack) == 0 )
- $splitpart .= '</_paragraph_bypass>';
- }
- }
- else
- {
- // push
- array_push($tag_stack, $tagname);
- if ( count($tag_stack) == 1 )
- $splitpart = '<_paragraph_bypass>' . $splitpart;
- }
- }
- $text .= $splitpart;
- }
- //echo '<pre>' . htmlspecialchars(print_r($text, true)) . '</pre>';
- }
-
- // All things that should be para-bypassed now are surrounded by _paragraph_bypass tags.
-
- // die('<pre>' . htmlspecialchars($text) . '</pre>');
+ public $rules = array( // rule name => PCRE pattern for the simple inline rules; presumably applied by the Carpenter engine - TODO confirm
+ 'bold' => "/'''(.+?)'''/", // '''text'''
+ 'italic' => "/''(.+?)''/", // ''text''
+ 'underline' => '/__(.+?)__/', // __text__
+ 'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#', // [url link text] - http, https, irc or ftp only
+ 'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#', // [url]
+ 'mailtonotext' => '#\[mailto:([^ \]]+?)\]#', // [mailto:address]
+ 'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#', // [mailto:address link text]
+ 'hr' => '/^[-]{4,} *$/m', // a line made of four or more dashes, optionally followed by spaces
+ 'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis' // opens with <code> or <pre> at a line start, closes with </pre> or </code> at a line end; body may span lines
+ );
+
+ private $blockquote_rand_id; // written by blockquote(), handed back by blockquotepost()
+
+ public function lang(&$text) // Resolves <lang code="xx">...</lang> sections in $text (modified by reference); always returns an empty array
+ {
+ global $lang; // only ->lang_code is read from it here
+
+ preg_match_all('/<lang (?:code|id)="([a-z0-9_-]+)">([\w\W]+?)<\/lang>/', $text, $langmatch); // id="..." is accepted in place of code="..."; group 1 = language code, group 2 = inner content
+ foreach ( $langmatch[0] as $i => $match )
+ {
+ if ( $langmatch[1][$i] == $lang->lang_code )
+ {
+ $text = str_replace_once($match, $langmatch[2][$i], $text); // current language: swap the whole tag for its inner content
+ }
+ else
+ {
+ $text = str_replace_once($match, '', $text); // any other language: swap the whole tag for an empty string
+ }
+ }
+
+ return array();
+ }
+
+ public function templates(&$text) // Passes $text through RenderMan::include_templates() while a {{...}} construct still matches; always returns an empty array
+ {
+ $template_regex = "/\{\{(.+)((\n|\|[ ]*([A-z0-9]+)[ ]*=[ ]*(.+))*)\}\}/isU"; // NOTE(review): the range A-z also covers [ \ ] ^ _ and the backtick - confirm that is intended
+ $i = 0;
+ while ( preg_match($template_regex, $text, $match) ) // $match is only a required out-parameter; it is not read
+ {
+ $i++;
+ if ( $i == 5 ) // pass limit: include_templates() runs at most 4 times per call
+ break;
+ $text = RenderMan::include_templates($text);
+ }
+
+ return array();
+ }
+
+ public function heading(&$text) // Collects "== Title ==" style lines; returns a list of array('level', 'text') and tokenizes the matched lines in $text
+ {
+ if ( !preg_match_all('/^(={1,6}) *(.+?) *\\1 *$/m', $text, $results) ) // 1 to 6 "=" on the left; the backreference demands the same run on the right
+ return array();
+
+ $headings = array();
+ foreach ( $results[0] as $i => $match )
+ {
+ $headings[] = array(
+ 'level' => strlen($results[1][$i]), // number of "=" characters
+ 'text' => $results[2][$i] // title with the surrounding spaces excluded by the pattern
+ );
+ }
+
+ $text = Carpenter::tokenize($text, $results[0]); // Carpenter::tokenize() is defined elsewhere; presumably swaps each full match for a placeholder - TODO confirm
+
+ return $headings;
+ }
- RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true);
-
- // This is potentially a hack. It allows the parser to stick in <_paragraph_bypass> tags
- // to prevent the paragraph parser from interfering with pretty HTML generated elsewhere.
- RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
-
- $startcond = "(?!(?:[\\r\\n]|\{_paragraph_bypass:[a-f0-9]{32}:[0-9]+\}|[ ]*<\/?(?:$blocklevel)(?: .+>|>)))";
- $regex = "/^
- $startcond # line start condition - do not match if the line starts with the condition above
- .+? # body text
- (?:
- \\n # additional lines
- $startcond # make sure of only one newline in a row, and end the paragraph if a new line fails the start condition
- .*?
- )* # keep going until it fails
- $
- /mx";
-
- if ( !preg_match_all($regex, $text, $matches) )
- {
- RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
- return array();
- }
-
- // Debugging :)
- // die('<pre>' . htmlspecialchars($text) . "\n-----------------------------------------------------------\n" . htmlspecialchars(print_r($matches, true)) . '</pre>');
-
- // restore stripped
- RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
-
- // tokenize
- $text = Carpenter::tokenize($text, $matches[0]);
-
- return $matches[0];
- }
+ public function multilist(&$text) // Parses wiki lists; returns one array('type', 'items') per list and tokenizes the matched lists in $text
+ {
+ // Match entire lists: a line starting with one or more of : # * plus any following lines that start with the captured delimiter or with two or more spaces
+ $regex = '/^
+ ([:#\*])+ # Initial list delimiter
+ [ ]*
+ .+?
+ (?:
+ \r?\n
+ (?:\\1|[ ]{2,})
+ [ ]*
+ .+?)*
+ $/mx';
+
+ if ( !preg_match_all($regex, $text, $lists) )
+ return array();
+
+ $types = array( // delimiter character => list type name
+ '*' => 'unordered',
+ '#' => 'ordered',
+ ':' => 'indent'
+ );
+
+ $pieces = array();
+ foreach ( $lists[0] as $i => $list )
+ {
+ $token = $lists[1][$i]; // NOTE(review): group 1 is a repeated group, so this is the LAST delimiter of the opening prefix - confirm mixed prefixes such as "*#" behave as intended
+ $piece = array(
+ 'type' => $types[$token],
+ 'items' => array()
+ );
+
+ // convert windows newlines to unix
+ $list = str_replace("\r\n", "\n", $list);
+ $items_pre = explode("\n", $list);
+ $items = array();
+ // first pass, go through and combine items that are newlined
+ foreach ( $items_pre as $item )
+ {
+ if ( substr($item, 0, 1) == $token ) // line opens with the list delimiter: it starts a new item
+ {
+ $items[] = $item;
+ }
+ else
+ {
+ // it's a continuation of the previous LI. Don't need to worry about
+ // undefined indices here since the regex should filter out all invalid
+ // markup. Just append this line to the previous.
+ $items[ count($items) - 1 ] .= "\n" . trim($item);
+ }
+ }
+
+ // second pass, separate items and tokens
+ unset($items_pre);
+ foreach ( $items as $item )
+ {
+ // get the depth
+ $itemtoken = preg_replace('/^([#:\*]+).*$/s', '$1', $item); // keeps only the leading run of delimiter characters
+ // get the text
+ $itemtext = trim(substr($item, strlen($itemtoken)));
+ $piece['items'][] = array(
+ // depth starts at 1
+ 'depth' => strlen($itemtoken),
+ 'text' => $itemtext
+ );
+ }
+ $pieces[] = $piece;
+ }
+
+ $text = Carpenter::tokenize($text, $lists[0]); // Carpenter::tokenize() is defined elsewhere; presumably swaps each matched list for a placeholder - TODO confirm
+
+ return $pieces;
+ }
+
+ public function blockquote(&$text) // Rewrites ">"-prefixed line runs in $text into {blockquote:ID} ... {/blockquote:ID} markers; NOTE(review): no return statement, unlike the sibling rule methods
+ {
+ $rand_id = hexencode(AESCrypt::randkey(16), '', ''); // one id per call, shared by every marker; hexencode() and AESCrypt are defined elsewhere
+
+ while ( preg_match_all('/^(?:(>+) *.+(?:\r?\n|$))+/m', $text, $quotes) ) // repeats while quoted runs remain, so nested ">>" is unwrapped one level per pass
+ {
+ foreach ( $quotes[0] as $quote )
+ {
+ $piece = trim(preg_replace('/^> */m', '', $quote)); // drop one leading ">" and the spaces after it from every line
+ $text = str_replace_once($quote, "{blockquote:$rand_id}\n$piece\n{/blockquote:$rand_id}\n", $text);
+ }
+ }
+ //die('<pre>' . htmlspecialchars($text) . '</pre>');
+
+ $this->blockquote_rand_id = $rand_id; // remembered so blockquotepost() can report which id the markers carry
+ }
+
+ public function blockquotepost(&$text) // Returns the id stored by the last blockquote() call; $text is not read or modified here
+ {
+ return $this->blockquote_rand_id;
+ }
+
+ public function paragraph(&$text) // Finds the line runs that should become paragraphs, tokenizes them in $text and returns the matched strings (empty array when none match)
+ {
+ // The trick with paragraphs is to not turn things into them when a block level element already wraps the block of text.
+ // First we need a list of block level elements (http://htmlhelp.com/reference/html40/block.html + some Enano extensions)
+ $blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';
+
+ // Wrap all block level tags (existing _paragraph_bypass sections are stripped into $_nw first - presumably so they are not wrapped twice; TODO confirm)
+ RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
+
+ // Find all opening and closing tags
+
+ $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s"; // captured as a whole so preg_split() below keeps the tags as their own parts
+
+ // The original approach replaced each match with a unique token id so identical matches
+ // were not wrapped several times. NOTE(review): $tokens and $rand_id below are never read
+ // again in this method - they look like leftovers of that approach; confirm before removing.
+
+ $tokens = array();
+ $rand_id = sha1(microtime() . mt_rand());
+ $tag_stack = array(); // names of the block-level tags currently open, outermost first
+
+ if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
+ {
+ $text = ''; // rebuilt part by part in the loop below
+ // go through the text, extract tag names, and push them to a stack.
+ foreach ( $text_split as $splitpart )
+ {
+ if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
+ {
+ $tagname = $match[2];
+ if ( $match[1] == '/' )
+ {
+ // closing tag
+ if ( $tagname != ($top = array_pop($tag_stack)) )
+ {
+ // invalid - push back, unless the stack was empty: array_pop() then gave NULL, and pushing NULL would keep the stack from ever returning to zero
+ if ( $top !== null ) array_push($tag_stack, $top);
+ }
+ else
+ {
+ // valid - if stack's at zero, add a </_paragraph_bypass>
+ if ( count($tag_stack) == 0 )
+ $splitpart .= '</_paragraph_bypass>';
+ }
+ }
+ else
+ {
+ // push
+ array_push($tag_stack, $tagname);
+ if ( count($tag_stack) == 1 ) // outermost opening tag: the bypass wrapper starts here
+ $splitpart = '<_paragraph_bypass>' . $splitpart;
+ }
+ }
+ $text .= $splitpart;
+ }
+ //echo '<pre>' . htmlspecialchars(print_r($text, true)) . '</pre>';
+ }
+
+ // All things that should be para-bypassed now are surrounded by _paragraph_bypass tags.
+
+ // die('<pre>' . htmlspecialchars($text) . '</pre>');
+
+ RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true); // puts back what the first tag_strip() removed; meaning of the 4th argument is not visible here - TODO confirm
+
+ // This is potentially a hack. It allows the parser to stick in <_paragraph_bypass> tags
+ // to prevent the paragraph parser from interfering with pretty HTML generated elsewhere.
+ RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
+
+ $startcond = "(?!(?:[\\r\\n]|\{_paragraph_bypass:[a-f0-9]{32}:[0-9]+\}|[ ]*<\/?(?:$blocklevel)(?: .+>|>)))"; // negative lookahead: a paragraph line may not start with a line break, a bypass placeholder (presumably what tag_strip() leaves behind) or a block-level tag
+ $regex = "/^
+ $startcond # line start condition - do not match if the line starts with the condition above
+ .+? # body text
+ (?:
+ \\n # additional lines
+ $startcond # make sure of only one newline in a row, and end the paragraph if a new line fails the start condition
+ .*?
+ )* # keep going until it fails
+ $
+ /mx";
+
+ if ( !preg_match_all($regex, $text, $matches) )
+ {
+ RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw); // nothing to wrap: restore the stripped sections before leaving
+ return array();
+ }
+
+ // Debugging :)
+ // die('<pre>' . htmlspecialchars($text) . "\n-----------------------------------------------------------\n" . htmlspecialchars(print_r($matches, true)) . '</pre>');
+
+ // restore stripped
+ RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
+
+ // tokenize
+ $text = Carpenter::tokenize($text, $matches[0]); // Carpenter::tokenize() is defined elsewhere; presumably swaps each paragraph for a placeholder - TODO confirm
+
+ return $matches[0];
+ }
}
function parser_mediawiki_xhtml_image($text)
{
- $text = RenderMan::process_image_tags($text, $taglist);
- $text = RenderMan::process_imgtags_stage2($text, $taglist);
- return $text;
+ $text = RenderMan::process_image_tags($text, $taglist); // $taglist is not set before this call - presumably filled by reference; TODO confirm
+ $text = RenderMan::process_imgtags_stage2($text, $taglist); // second stage receives the same $taglist
+ return $text;
}
function parser_mediawiki_xhtml_tables($text)
{
- return process_tables($text);
+ return process_tables($text); // thin wrapper: process_tables() is defined elsewhere and its result is returned unchanged
}