11 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details. |
11 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details. |
12 */ |
12 */ |
13 |
13 |
/**
 * MediaWiki-style wikitext rule set for the Carpenter parser.
 *
 * Simple inline rules are exposed as regular expressions in $rules. Rules that
 * need more than a single regex are implemented as methods named after the
 * rule; each takes the document text by reference, may rewrite it in place,
 * and returns the data extracted for that rule.
 */
class Carpenter_Parse_MediaWiki
{
  /**
   * Regex-only rules, keyed by rule name.
   * @var array
   */
  public $rules = array(
    'bold' => "/'''(.+?)'''/",
    'italic' => "/''(.+?)''/",
    'underline' => '/__(.+?)__/',
    'externalwithtext' => '#\[((?:https?|irc|ftp)://.+?) (.+?)\]#',
    'externalnotext' => '#\[((?:https?|irc|ftp)://.+?)\]#',
    'mailtonotext' => '#\[mailto:([^ \]]+?)\]#',
    'mailtowithtext' => '#\[mailto:([^ \]]+?) (.+?)\]#',
    'hr' => '/^[-]{4,} *$/m',
    'code' => '/^(?:<code>(?:\r?\n)?|<pre>)(.+?)(?:<\/pre>|(?:\r?\n)?<\/code>)$/mis'
  );
  
  /**
   * Identifier embedded in the {blockquote:...} markers written by the most
   * recent blockquote() call; handed back by blockquotepost().
   * @var string
   */
  private $blockquote_rand_id;
  
  /**
   * Resolves <lang code="xx">...</lang> (or id="xx") sections in place.
   * A section whose code equals the global $lang->lang_code is replaced by its
   * inner content; every other section is replaced by an empty string.
   * @param string Text to process, modified in place
   * @return array Always empty
   */
  public function lang(&$text)
  {
    global $lang;
    
    preg_match_all('/<lang (?:code|id)="([a-z0-9_-]+)">([\w\W]+?)<\/lang>/', $text, $langmatch);
    foreach ( $langmatch[0] as $i => $match )
    {
      // keep the inner content only for the active language
      $replacement = ( $langmatch[1][$i] == $lang->lang_code ) ? $langmatch[2][$i] : '';
      // NOTE(review): str_replace_once() is defined elsewhere; presumably it replaces only the first occurrence - confirm
      $text = str_replace_once($match, $replacement, $text);
    }
    
    return array();
  }
  
  /**
   * Expands {{template}} inclusions by repeatedly calling
   * RenderMan::include_templates() while the text still contains something
   * that looks like a template call. At most four expansion passes are made.
   * @param string Text to process, modified in place
   * @return array Always empty
   */
  public function templates(&$text)
  {
    // NOTE(review): the [A-z0-9] range also covers the ASCII punctuation between "Z" and "a"
    // (including "_"); left untouched because parameter names may depend on it.
    $template_regex = "/\{\{(.+)((\n|\|[ ]*([A-z0-9]+)[ ]*=[ ]*(.+))*)\}\}/isU";
    $i = 0;
    while ( preg_match($template_regex, $text, $match) )
    {
      $i++;
      // hard stop on the fifth detection so a template that keeps producing templates cannot loop forever
      if ( $i == 5 )
        break;
      $text = RenderMan::include_templates($text);
    }
    
    return array();
  }
  
  /**
   * Finds headings of the form "== Title ==" (one to six equals signs, the
   * same number on both sides, on a line of their own).
   * The matched lines are then passed to Carpenter::tokenize() together with
   * the text, and the result replaces the text.
   * @param string Text to process, modified in place
   * @return array List of array('level' => int, 'text' => string), in document order; empty if no heading was found
   */
  public function heading(&$text)
  {
    if ( !preg_match_all('/^(={1,6}) *(.+?) *\\1 *$/m', $text, $results) )
      return array();
    
    $headings = array();
    foreach ( $results[0] as $i => $match )
    {
      $headings[] = array(
          // the level is the number of "=" characters in the opening run
          'level' => strlen($results[1][$i]),
          'text' => $results[2][$i]
        );
    }
    
    $text = Carpenter::tokenize($text, $results[0]);
    
    return $headings;
  }
  
  /**
   * Finds lists built from "*" (unordered), "#" (ordered) and ":" (indent)
   * markers. A list is a run of consecutive lines that begin with the same
   * first marker as the first line; a line that begins with two or more spaces
   * continues the previous item.
   * The matched lists are then passed to Carpenter::tokenize() together with
   * the text, and the result replaces the text.
   * @param string Text to process, modified in place
   * @return array List of array('type' => string, 'items' => array(array('depth' => int, 'text' => string), ...)); empty if no list was found
   */
  public function multilist(&$text)
  {
    // Match entire lists. Group 1 holds the FIRST marker of the list's first line: it decides the
    // list type and is what every following line must start with. (Capturing a repeated group
    // would keep the last marker instead, which disagrees with the first-character test below
    // for mixed markers such as "#*".)
    $regex = '/^
              ([:#\*])[:#\*]*   # Initial list delimiter, then any further nesting markers
              [ ]*
              .+?
              (?:
                \r?\n
                (?:\\1|[ ]{2,})
                [ ]*
                .+?)*
              $/mx';
    
    if ( !preg_match_all($regex, $text, $lists) )
      return array();
    
    $types = array(
        '*' => 'unordered',
        '#' => 'ordered',
        ':' => 'indent'
      );
    
    $pieces = array();
    foreach ( $lists[0] as $i => $list )
    {
      $token = $lists[1][$i];
      $piece = array(
          'type' => $types[$token],
          'items' => array()
        );
      
      // convert windows newlines to unix
      $list = str_replace("\r\n", "\n", $list);
      $items_pre = explode("\n", $list);
      $items = array();
      // first pass, go through and combine items that are newlined
      foreach ( $items_pre as $item )
      {
        if ( substr($item, 0, 1) == $token )
        {
          $items[] = $item;
        }
        else
        {
          // it's a continuation of the previous LI. The regex guarantees that the first line
          // starts with $token, so there is always a previous item to append to.
          $items[ count($items) - 1 ] .= "\n" . trim($item);
        }
      }
      
      // second pass, separate items and tokens
      unset($items_pre);
      foreach ( $items as $item )
      {
        // the leading run of markers gives the depth
        $itemtoken = preg_replace('/^([#:\*]+).*$/s', '$1', $item);
        // everything after the markers is the item text
        $itemtext = trim(substr($item, strlen($itemtoken)));
        $piece['items'][] = array(
            // depth starts at 1
            'depth' => strlen($itemtoken),
            'text' => $itemtext
          );
      }
      $pieces[] = $piece;
    }
    
    $text = Carpenter::tokenize($text, $lists[0]);
    
    return $pieces;
  }
  
  /**
   * Rewrites email-style quotes (runs of lines starting with ">") into
   * {blockquote:ID} ... {/blockquote:ID} sections, where ID is generated once
   * per call. One level of ">" is removed per pass and the scan repeats until
   * no quoted line is left, so ">>" lines end up inside nested sections.
   * The ID is stored so that blockquotepost() can return it.
   * @param string Text to process, modified in place
   * @return void
   */
  public function blockquote(&$text)
  {
    // NOTE(review): hexencode() and AESCrypt::randkey() are defined elsewhere; presumably this yields a random hex string - confirm
    $rand_id = hexencode(AESCrypt::randkey(16), '', '');
    
    while ( preg_match_all('/^(?:(>+) *.+(?:\r?\n|$))+/m', $text, $quotes) )
    {
      foreach ( $quotes[0] as $quote )
      {
        // peel exactly one ">" (and the spaces after it) off the start of every line
        $piece = trim(preg_replace('/^> */m', '', $quote));
        $text = str_replace_once($quote, "{blockquote:$rand_id}\n$piece\n{/blockquote:$rand_id}\n", $text);
      }
    }
    
    $this->blockquote_rand_id = $rand_id;
  }
  
  /**
   * Returns the identifier stored by the last blockquote() call. The text is
   * not modified.
   * @param string Text (unused)
   * @return string|null The identifier, or null if blockquote() has not run on this instance
   */
  public function blockquotepost(&$text)
  {
    return $this->blockquote_rand_id;
  }
  
  /**
   * Finds the runs of lines that should become paragraphs.
   * Top-level block-level HTML elements are first wrapped in <_paragraph_bypass>
   * tags; bypassed content is stripped out while the paragraph regex runs, then
   * restored. The matched paragraphs are passed to Carpenter::tokenize()
   * together with the text, and the result replaces the text.
   * @param string Text to process, modified in place
   * @return array The matched paragraph strings; empty if none were found
   */
  public function paragraph(&$text)
  {
    // The trick with paragraphs is to not turn things into them when a block level element already wraps the block of text.
    // First we need a list of block level elements (http://htmlhelp.com/reference/html40/block.html + some Enano extensions)
    $blocklevel = 'address|blockquote|center|code|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|li|ol|p|pre|table|ul|tr|td|th|tbody|thead|tfoot';
    
    // Set aside any bypass sections that are already present before adding our own
    // NOTE(review): RenderMan::tag_strip()/tag_unstrip() are defined elsewhere; exact placeholder format not visible here
    RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
    
    // Find all opening and closing block level tags
    $regex = ";(<(?:/(?:$blocklevel)|(?:$blocklevel)(?: [^>]*?)?)>);s";
    
    $tag_stack = array();
    
    if ( $text_split = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE) )
    {
      $text = '';
      // go through the text, extract tag names, and push them to a stack.
      foreach ( $text_split as $splitpart )
      {
        if ( preg_match(";^<(/)?($blocklevel)( |>);i", $splitpart, $match) )
        {
          $tagname = $match[2];
          if ( $match[1] == '/' )
          {
            // closing tag
            $top = array_pop($tag_stack);
            if ( $top === null )
            {
              // stray closing tag with nothing open. array_pop() returned null because the stack
              // was empty; leave it empty. Pushing the null back would make every later element
              // look nested, so it would never receive a bypass wrapper.
            }
            else if ( $tagname != $top )
            {
              // invalid - does not close the innermost open element, push it back
              array_push($tag_stack, $top);
            }
            else if ( count($tag_stack) == 0 )
            {
              // valid, and the stack's at zero - add a </_paragraph_bypass>
              $splitpart .= '</_paragraph_bypass>';
            }
          }
          else
          {
            // opening tag - push; the outermost element opens the bypass section
            array_push($tag_stack, $tagname);
            if ( count($tag_stack) == 1 )
              $splitpart = '<_paragraph_bypass>' . $splitpart;
          }
        }
        $text .= $splitpart;
      }
    }
    
    // All things that should be para-bypassed now are surrounded by _paragraph_bypass tags.
    
    RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw, true);
    
    // This is potentially a hack. It allows the parser to stick in <_paragraph_bypass> tags
    // to prevent the paragraph parser from interfering with pretty HTML generated elsewhere.
    RenderMan::tag_strip('_paragraph_bypass', $text, $_nw);
    
    // NOTE(review): $startcond is only used inside the /x pattern below, where the literal space
    // in "(?: .+>|>)" is ignored. Left as is because changing it would alter which lines are
    // treated as block level (e.g. "<hr/>").
    $startcond = "(?!(?:[\\r\\n]|\{_paragraph_bypass:[a-f0-9]{32}:[0-9]+\}|[ ]*<\/?(?:$blocklevel)(?: .+>|>)))";
    $regex = "/^
                $startcond        # line start condition - do not match if the line starts with the condition above
                .+?               # body text
                (?:
                  \\n             # additional lines
                  $startcond      # make sure of only one newline in a row, and end the paragraph if a new line fails the start condition
                  .*?
                )*                # keep going until it fails
              $
              /mx";
    
    if ( !preg_match_all($regex, $text, $matches) )
    {
      // nothing to wrap; put the bypassed content back before leaving
      RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
      return array();
    }
    
    // restore stripped
    RenderMan::tag_unstrip('_paragraph_bypass', $text, $_nw);
    
    // tokenize
    $text = Carpenter::tokenize($text, $matches[0]);
    
    return $matches[0];
  }
}
278 |
278 |
/**
 * Runs both image-tag stages from RenderMan over the text: first
 * process_image_tags(), then process_imgtags_stage2() on its result. The same
 * $taglist variable is handed to both calls.
 * @param string Text to process
 * @return string Result of the second stage
 */
function parser_mediawiki_xhtml_image($text)
{
  // NOTE(review): $taglist is not initialised here; presumably the first stage fills it by reference - confirm
  $stage_one = RenderMan::process_image_tags($text, $taglist);
  return RenderMan::process_imgtags_stage2($stage_one, $taglist);
}
285 |
285 |
/**
 * Passes the text through process_tables() and returns whatever it produces.
 * @param string Text to process
 * @return mixed Return value of process_tables()
 */
function parser_mediawiki_xhtml_tables($text)
{
  $rendered = process_tables($text);
  return $rendered;
}
290 |
290 |