|
1 <?php |
|
2 |
|
3 /** |
|
4 * Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between |
|
5 * Version 1.0 (Banshee) |
|
6 * Copyright (C) 2006-2007 Dan Fuhry |
|
7 * |
|
8 * This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License |
|
9 * as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. |
|
10 * |
|
11 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied |
|
12 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details. |
|
13 * |
|
14 * This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under |
|
15 * the GPLv2; see the file GPL included with this package for details. |
|
16 * |
|
17 * We're using the MW parser because the Text_Wiki version simply refused to work under PHP 5.2.0. Porting this was |
|
18 * _not_ easy. <leaves to get cup of coffee> |
|
19 */ |
|
20 |
|
// Shared parser state. The `global` statement only matters when this file is
// included from inside a function scope.
global $mStripState, $wgRandomKey;
$mStripState = array();

// Regex fragments interpolated into MW_ATTRIBS_REGEX below.
$attrib = '[a-zA-Z0-9]';
$space = '[\x09\x0a\x0d\x20]';

// Matches one character reference. Capture groups, in order:
// 1 = named entity, 2 = decimal, 3 = hex ("x"), 4 = hex ("X"), 5 = bare "&".
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9]+);
|&\#([0-9]+);
|&\#x([0-9A-Za-z]+);
|&\#X([0-9A-Za-z]+);
|(&)/x' );

// Matches one attribute of a tag. Capture groups, in order:
// 1 = name, 2 = whole "=value" part, 3 = double-quoted value,
// 4 = single-quoted value, 5 = unquoted value, 6 = unquoted #hex colour.
define( 'MW_ATTRIBS_REGEX',
"/(?:^|{$space})({$attrib}+)
({$space}*={$space}*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?={$space}|\$)/sx" );
|
47 |
|
/**
 * Public entry point: render wiki table markup found in the given text.
 *
 * The strip/unstrip passes of the original MediaWiki parser are intentionally
 * not run here; the text is handed straight to doTableStuff().
 *
 * @param string $text the text to parse
 * @return string the text with table markup converted
 * @access public
 */
function process_tables( $text )
{
    // Bind the shared strip state, as the parent parser function would.
    global $mStripState;

    return doTableStuff( $text );
}
|
72 |
|
/**
 * Parse the wiki syntax used to render tables.
 *
 * The text is processed line by line. Lines starting with "{|", "|}", "|-",
 * "|", "!" or "|+" (while a table is open) are replaced with the matching
 * HTML table tags, each wrapped in <nowiki>...</nowiki>. All other lines are
 * passed through unchanged. Tables may be nested; one entry per open table is
 * kept on each of the state stacks below.
 *
 * Tables still open at the end of the text are closed automatically, using
 * the tag name of the cell that is actually open (td, th or caption).
 *
 * @param string $t the text to parse
 * @return string
 * @access private
 */
function doTableStuff( $t ) {
    $t = explode( "\n", $t );
    $td = array();             # Is currently a td tag open?
    $ltd = array();            # Was it TD or TH (or caption)?
    $tr = array();             # Is currently a tr tag open?
    $ltr = array();            # tr attributes
    $has_opened_tr = array();  # Did this table open a <tr> element?
    $indent_level = 0;         # indent level of the table

    foreach ( $t as $k => $x ) {
        $x = trim( $x );
        $fc = substr( $x, 0, 1 );

        if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) ) {
            # Table start; leading colons indent the table via <dl><dd>
            $indent_level = strlen( $matches[1] );

            $attributes = unstripForHTML( $matches[2] );

            $t[$k] = str_repeat( '<dl><dd>', $indent_level ) .
                '<nowiki><table' . fixTagAttributes( $attributes, 'table' ) . '></nowiki>';
            array_push( $td, false );
            array_push( $ltd, '' );
            array_push( $tr, false );
            array_push( $ltr, '' );
            array_push( $has_opened_tr, false );
        }
        elseif ( count( $td ) == 0 ) {
            # Not inside a table: don't do any of the following
        }
        elseif ( '|}' == substr( $x, 0, 2 ) ) {
            # Table end: close whatever is still open, innermost first
            $z = "<nowiki></table></nowiki>" . substr( $x, 2 );
            $l = array_pop( $ltd );
            if ( !array_pop( $has_opened_tr ) ) $z = "<nowiki><tr><td></td></tr></nowiki>" . $z;
            if ( array_pop( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z;
            if ( array_pop( $td ) ) $z = '<nowiki></' . $l . '></nowiki>' . $z;
            array_pop( $ltr );
            $t[$k] = $z . str_repeat( '<nowiki></dd></dl></nowiki>', $indent_level );
        }
        elseif ( '|-' == substr( $x, 0, 2 ) ) {
            # Row separator; allows for |---------------
            $x = ltrim( substr( $x, 1 ), '-' );
            $z = '';
            $l = array_pop( $ltd );
            array_pop( $has_opened_tr );
            array_push( $has_opened_tr, true );
            if ( array_pop( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z;
            if ( array_pop( $td ) ) $z = '<nowiki></' . $l . '></nowiki>' . $z;
            array_pop( $ltr );
            $t[$k] = $z;
            array_push( $tr, false );
            array_push( $td, false );
            array_push( $ltd, '' );
            # Whatever follows the dashes is the attribute list of the next <tr>
            $attributes = unstripForHTML( $x );
            array_push( $ltr, fixTagAttributes( $attributes, 'tr' ) );
        }
        elseif ( '|' == $fc || '!' == $fc || '|+' == substr( $x, 0, 2 ) ) {
            # $x is a table row (cells, headers or a caption)
            if ( '|+' == substr( $x, 0, 2 ) ) {
                $fc = '+';
                $x = substr( $x, 1 );
            }
            $after = substr( $x, 1 );
            if ( $fc == '!' ) $after = str_replace( '!!', '||', $after );

            // Split up multiple cells on the same line.
            // FIXME: This can result in improper nesting of tags processed
            // by earlier parser steps, but should avoid splitting up eg
            // attribute values containing literal "||".
            $after = wfExplodeMarkup( '||', $after );

            $t[$k] = '';

            # Loop through each table cell
            foreach ( $after as $theline ) {
                $z = '';
                if ( $fc != '+' ) {
                    # Cells (but not captions) need an open row
                    $tra = array_pop( $ltr );
                    if ( !array_pop( $tr ) ) $z = '<nowiki><tr' . $tra . "></nowiki>\n";
                    array_push( $tr, true );
                    array_push( $ltr, '' );
                    array_pop( $has_opened_tr );
                    array_push( $has_opened_tr, true );
                }

                # Close the previous cell, then remember the new cell's tag
                $l = array_pop( $ltd );
                if ( array_pop( $td ) ) $z = '<nowiki></' . $l . '></nowiki>' . $z;
                if ( $fc == '|' ) $l = 'td';
                elseif ( $fc == '!' ) $l = 'th';
                elseif ( $fc == '+' ) $l = 'caption';
                else $l = '';
                array_push( $ltd, $l );

                # Cell parameters
                $y = explode( '|', $theline, 2 );
                # Note that a '|' inside an invalid link should not
                # be mistaken as delimiting cell parameters
                if ( strpos( $y[0], '[[' ) !== false ) {
                    $y = array( $theline );
                }
                if ( count( $y ) == 1 ) {
                    $y = "{$z}<nowiki><{$l}></nowiki>{$y[0]}";
                } else {
                    $attributes = unstripForHTML( $y[0] );
                    $y = "{$z}<nowiki><{$l}" . fixTagAttributes( $attributes, $l ) . "></nowiki>{$y[1]}";
                }
                $t[$k] .= $y;
                array_push( $td, true );
            }
        }
    }

    # Closing open td, tr && table
    while ( count( $td ) > 0 ) {
        $l = array_pop( $ltd );
        # Close the cell with its real tag name; it may be th or caption
        if ( array_pop( $td ) ) $t[] = '<nowiki></' . $l . '></nowiki>';
        if ( array_pop( $tr ) ) $t[] = '<nowiki></tr></nowiki>';
        if ( !array_pop( $has_opened_tr ) ) $t[] = "<nowiki><tr><td></td></tr></nowiki>";
        $t[] = '<nowiki></table></nowiki>';
    }

    $t = implode( "\n", $t );

    # special case: don't return empty table
    if ( $t == "<nowiki><table></nowiki>\n<nowiki><tr><td></td></tr></nowiki>\n<nowiki></table></nowiki>" ) {
        $t = '';
    }
    return $t;
}
|
208 |
|
/**
 * Turn a tag-soup attribute fragment into a normalized attribute string.
 *
 * The fragment is decoded with decodeTagAttributes(), filtered with
 * validateTagAttributes() for the given element, and every surviving
 * name/value pair is re-emitted as name="value" with the name passed through
 * htmlspecialchars() and the value through safeEncodeAttribute().
 *
 * The result starts with a space when at least one attribute survives, and
 * is the empty string otherwise (including for blank input).
 *
 * @param string $text    raw attribute text
 * @param string $element element name used for whitelist lookup
 * @return string
 */
function fixTagAttributes( $text, $element ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    $decoded = decodeTagAttributes( $text );
    $allowed = validateTagAttributes( $decoded, $element );

    $result = '';
    foreach ( $allowed as $name => $rawValue ) {
        $safeName = htmlspecialchars( $name );
        $safeValue = safeEncodeAttribute( $rawValue );
        $result .= ' ' . $safeName . '="' . $safeValue . '"';
    }
    return $result;
}
|
245 |
|
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * After the normal HTML encoding done by encodeAttribute(), characters and
 * keywords that later wikitext passes could act on (templates, links,
 * quotes, magic words, table pipes) are replaced with numeric character
 * references so they are no longer recognised as markup.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function safeEncodeAttribute( $text ) {
    $encValue = encodeAttribute( $text );

    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $encValue = strtr( $encValue, array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    ) );

    return $encValue;
}
|
273 |
|
/**
 * Encode an attribute value for HTML output.
 *
 * The value is passed through htmlspecialchars(), then newline, carriage
 * return and tab characters are turned into numeric character references.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
    $encValue = htmlspecialchars( $text );

    // Whitespace is normalized during attribute decoding,
    // so if we've been passed non-spaces we must encode them
    // ahead of time or they won't be preserved.
    $encValue = strtr( $encValue, array(
        "\n" => '&#10;',
        "\r" => '&#13;',
        "\t" => '&#9;',
    ) );

    return $encValue;
}
|
293 |
|
/**
 * Restore stripped content in a text fragment using the shared strip state:
 * first the general markers (unstrip), then the nowiki markers
 * (unstripNoWiki).
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text ) {
    global $mStripState;

    $general = unstrip( $text, $mStripState );
    return unstripNoWiki( $general, $mStripState );
}
|
300 |
|
/**
 * Replace the markers stored under $state['nowiki'] with their content.
 * Returns the text unchanged when the state has no 'nowiki' entry.
 *
 * Always call this after unstrip() to preserve the order.
 *
 * @param string $text
 * @param array  $state strip state, passed by reference
 * @return string
 * @private
 */
function unstripNoWiki( $text, &$state ) {
    # TODO: good candidate for FSS
    return isset( $state['nowiki'] )
        ? strtr( $text, $state['nowiki'] )
        : $text;
}
|
316 |
|
/**
 * Filter an attribute name => value array for a given element type.
 *
 * - Attributes not on the element's whitelist are dropped
 * - 'style' values go through checkCss(); the attribute is dropped when
 *   checkCss() returns false
 * - 'id' values are passed through escapeId()
 *
 * @param array  $attribs
 * @param string $element
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
    $allowed = array_flip( attributeWhitelist( $element ) );
    $clean = array();

    foreach ( $attribs as $name => $val ) {
        if ( isset( $allowed[$name] ) ) {
            if ( $name == 'style' ) {
                # Strip javascript "expression" from stylesheets.
                # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
                $val = checkCss( $val );
                if ( $val === false ) {
                    # Rejected stylesheet: drop the attribute entirely
                    continue;
                }
            }

            if ( $name === 'id' ) {
                $val = escapeId( $val );
            }

            // A later occurrence of the same name overrides an earlier one,
            // so the output has only one attribute of each name.
            $clean[$name] = $val;
        }
    }
    return $clean;
}
|
357 |
|
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * The returned string is the input with character references decoded and
 * CSS comments removed. The forbidden-pattern check runs on a further
 * normalized copy in which CSS hex escapes are decoded and backslashes
 * are removed.
 *
 * @param string $value
 * @return mixed sanitized string, or false when a forbidden pattern is found
 */
function checkCss( $value ) {
    $stripped = decodeCharReferences( $value );

    // Remove any comments; IE gets token splitting wrong
    $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
    $value = $stripped;

    // ... and continue checks.
    // Decode CSS hex escapes with a callback; the /e modifier used
    // previously no longer exists in PHP 7 and later.
    $stripped = preg_replace_callback(
        '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
        'checkCssDecodeEscape',
        $stripped );
    $stripped = str_replace( '\\', '', $stripped );
    if ( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', $stripped ) ) {
        # Forbidden construct found
        return false;
    }

    return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): decode one CSS hex escape.
 *
 * Escapes beyond U+10FFFF are replaced with U+FFFD instead of being passed
 * to codepointToUtf8(), which terminates the script on out-of-range input.
 *
 * @param array $matches $matches[1] holds the hex digits of the escape
 * @return string UTF-8 text
 * @access private
 */
function checkCssDecodeEscape( $matches ) {
    $codepoint = hexdec( $matches[1] );
    if ( $codepoint >= 0x110000 ) {
        return "\xEF\xBF\xBD";
    }
    return codepointToUtf8( $codepoint );
}
|
386 |
|
/**
 * Decode character references (numeric or named entities) in the text by
 * running decodeCharReferencesCallback() over every MW_CHAR_REFS_REGEX match.
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text ) {
    $handler = 'decodeCharReferencesCallback';
    return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
|
402 |
|
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 * The full table is built once by setupAttributeWhitelist() and cached in a
 * static variable. Unknown elements yield an empty array.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element ) {
    static $cache;

    if ( !isset( $cache ) ) {
        $cache = setupAttributeWhitelist();
    }
    if ( isset( $cache[$element] ) ) {
        return $cache[$element];
    }
    return array();
}
|
419 |
|
/**
 * Build the table of allowed attribute names per element.
 *
 * Section numbers in the comments refer to the HTML 4.01 standard
 * (http://www.w3.org/TR/html4/).
 *
 * @return array element name => list of allowed attribute names
 */
function setupAttributeWhitelist() {
    $core = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $coreAlign = array_merge( $core, array( 'align' ) );
    $cellAlign = array( 'align', 'char', 'charoff', 'valign' );
    $cellOnly = array(
        'abbr',
        'axis',
        'headers',
        'scope',
        'rowspan',
        'colspan',
        'nowrap',  # deprecated
        'width',   # deprecated
        'height',  # deprecated
        'bgcolor', # deprecated
    );

    $map = array();

    # 7.5.4
    $map['div'] = $coreAlign;
    $map['center'] = $core; # deprecated
    $map['span'] = $coreAlign; # ??

    # 7.5.5
    foreach ( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $heading ) {
        $map[$heading] = $coreAlign;
    }

    # 9.2.1 (dfn, samp, kbd, abbr are not listed)
    foreach ( array( 'em', 'strong', 'cite', 'code', 'var' ) as $phrase ) {
        $map[$phrase] = $core;
    }

    # 9.2.2
    $map['blockquote'] = array_merge( $core, array( 'cite' ) );

    # 9.2.3
    $map['sub'] = $core;
    $map['sup'] = $core;

    # 9.3.1
    $map['p'] = $coreAlign;

    # 9.3.2
    $map['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

    # 9.3.4
    $map['pre'] = array_merge( $core, array( 'width' ) );

    # 9.4
    $revision = array_merge( $core, array( 'cite', 'datetime' ) );
    $map['ins'] = $revision;
    $map['del'] = $revision;

    # 10.2
    $map['ul'] = array_merge( $core, array( 'type' ) );
    $map['ol'] = array_merge( $core, array( 'type', 'start' ) );
    $map['li'] = array_merge( $core, array( 'type', 'value' ) );

    # 10.3
    $map['dl'] = $core;
    $map['dd'] = $core;
    $map['dt'] = $core;

    # 11.2.1
    $map['table'] = array_merge( $core, array(
        'summary', 'width', 'border', 'frame',
        'rules', 'cellspacing', 'cellpadding',
        'align', 'bgcolor',
    ) );

    # 11.2.2
    $map['caption'] = array_merge( $core, array( 'align' ) );

    # 11.2.3
    $rowGroup = array_merge( $core, $cellAlign );
    $map['thead'] = $rowGroup;
    $map['tfoot'] = $rowGroup;
    $map['tbody'] = $rowGroup;

    # 11.2.4
    $column = array_merge( $core, array( 'span', 'width' ), $cellAlign );
    $map['colgroup'] = $column;
    $map['col'] = $column;

    # 11.2.5
    $map['tr'] = array_merge( $core, array( 'bgcolor' ), $cellAlign );

    # 11.2.6
    $cell = array_merge( $core, $cellOnly, $cellAlign );
    $map['td'] = $cell;
    $map['th'] = $cell;

    # 12.2 (added by dan)
    $map['a'] = array_merge( $core, array( 'href', 'name' ) );

    # 13.2 (added by dan)
    $map['img'] = array_merge( $core, array( 'src', 'width', 'height', 'alt' ) );

    # 15.2.1
    foreach ( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $fontStyle ) {
        $map[$fontStyle] = $core;
    }

    # 15.2.2 (basefont is not listed)
    $map['font'] = array_merge( $core, array( 'size', 'color', 'face' ) );

    # 15.3
    $map['hr'] = array_merge( $core, array( 'noshade', 'size', 'width' ) );

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/ (rbc and rtc are not listed)
    foreach ( array( 'ruby', 'rb', 'rt', 'rp' ) as $rubyTag ) {
        $map[$rubyTag] = $core;
    }

    # For compatibility with the XHTML parser.
    $map['nowiki'] = array();
    $map['noinclude'] = array();
    $map['nodisplay'] = array();

    # XHTML stuff
    $map['acronym'] = $core;

    return $map;
}
|
575 |
|
/**
 * Escape a value so that it can be used in an id attribute. This does not
 * validate the value, however (see first link).
 *
 * Spaces become underscores, character references are decoded, the result
 * is URL-encoded, and finally '%3A' is turned back into ':' and every
 * remaining '%' into '.'.
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                         in the id and
 *                                                         name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
    $underscored = strtr( $id, ' ', '_' );
    $decoded = decodeCharReferences( $underscored );
    $encoded = urlencode( $decoded );

    // Order matters: restore colons before the generic '%' replacement.
    return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
}
|
602 |
|
/**
 * More or less "markup-safe" explode().
 * Ignores any instances of the separator inside <...>.
 *
 * Separators inside tags are temporarily swapped for a NUL placeholder,
 * the text is split, and the placeholder is swapped back in every piece.
 * Any NUL bytes already present in the input are removed first.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text ) {
    $nul = "\x00";

    // Just in case the input already contains the placeholder
    $withoutNul = str_replace( $nul, '', $text );

    // Hide separators that sit inside <...>
    $hider = new ReplacerCallback( $separator, $nul );
    $masked = preg_replace_callback( '/(<.*?>)/', array( $hider, 'go' ), $withoutNul );

    // Split, then restore the hidden separators in each piece
    return str_replace( $nul, $separator, explode( $separator, $masked ) );
}
|
627 |
|
/**
 * Small callback object for preg_replace_callback(): replaces every
 * occurrence of one string with another inside the first capture group.
 */
class ReplacerCallback {
    /** @var string text to search for */
    public $from;

    /** @var string replacement text */
    public $to;

    /**
     * Uses __construct() because a method named after the class is no
     * longer treated as a constructor in PHP 8.
     *
     * @param string $from text to search for
     * @param string $to   replacement text
     */
    function __construct( $from, $to ) {
        $this->from = $from;
        $this->to = $to;
    }

    /**
     * @param array $matches regex match; only $matches[1] is used
     * @return string $matches[1] with $from replaced by $to
     */
    function go( $matches ) {
        return str_replace( $this->from, $this->to, $matches[1] );
    }
}
|
638 |
|
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * whitespace runs in values are collapsed to a single space and trimmed,
 * and character references are decoded via decodeCharReferences().
 *
 * Blank input, or input in which MW_ATTRIBS_REGEX finds nothing, yields an
 * empty array. When a name occurs more than once the last value wins.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text ) {
    $result = array();

    if ( trim( $text ) == '' ) {
        return $result;
    }

    $sets = array();
    $found = preg_match_all( MW_ATTRIBS_REGEX, $text, $sets, PREG_SET_ORDER );
    if ( !$found ) {
        return $result;
    }

    foreach ( $sets as $set ) {
        $name = strtolower( $set[1] );
        $raw = getTagAttributeCallback( $set );

        // Normalize whitespace
        $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

        // Decode character references
        $result[$name] = decodeCharReferences( $collapsed );
    }
    return $result;
}
|
676 |
|
/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * The highest-numbered value group present wins: 6 (unquoted #colour),
 * 5 (unquoted), 4 (single-quoted), 3 (double-quoted). When there is no
 * value part at all (group 2 absent), the attribute name itself is
 * returned, since XHTML attributes must have a value.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
    // Groups 6 down to 3 hold the value in its various quoting styles.
    for ( $group = 6; $group >= 3; $group-- ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( !isset( $set[2] ) ) {
        # For 'reduced' form, return explicitly the attribute name here.
        return $set[1];
    }

    die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
}
|
706 |
|
/**
 * Strip <nowiki>, <gallery> and HTML comments out of the text, replacing
 * each with a unique marker, and record the content needed to put them back
 * in the strip state.
 *
 * - nowiki content is stored under $state['nowiki'] with ", > and <
 *   escaped by wfEscapeHTMLTagsOnly()
 * - elements without a renderer here (such as gallery) are stored under
 *   $state['general'] with empty output
 * - comments are put straight back into the text unless $stripcomments
 *   is set, in which case they are stored under $state['general']
 *
 * If $state already holds entries, the new ones are added to it. A fresh
 * random marker prefix is generated on every call and stored in the global
 * $wgRandomKey.
 *
 * @param string $text the text to strip
 * @param array  $state strip state, passed by reference
 * @param bool   $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 * @param array  $dontstrip contains tags which should not be stripped;
 *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
 * @return string the text with markers in place of the stripped parts
 *
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false, $dontstrip = array() ) {
    // $wgRawHtml is imported so the 'html' branch below reads a defined
    // value; 'html' is not in $elements, so that branch is not reached
    // with the current element list.
    global $wgRandomKey, $wgRawHtml;
    $render = true;

    $wgRandomKey = "\x07UNIQ" . dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
    $commentState = array();

    $elements = array( 'nowiki', 'gallery' );

    # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
    foreach ( $elements as $k => $v ) {
        if ( in_array( $v, $dontstrip ) ) {
            unset( $elements[$k] );
        }
    }

    $matches = array();
    $text = extractTagsAndParams( $elements, $text, $matches, $wgRandomKey );

    foreach ( $matches as $marker => $data ) {
        list( $element, $content, $params, $tag ) = $data;

        // Start every element from a clean slate, so an element without a
        // renderer never inherits the output of the previous one.
        $output = '';

        if ( $render ) {
            $tagName = strtolower( $element );
            switch ( $tagName ) {
                case '!--':
                    // Comment
                    if ( substr( $tag, -3 ) == '-->' ) {
                        $output = $tag;
                    } else {
                        // Unclosed comment in input.
                        // Close it so later stripping can remove it
                        $output = "$tag-->";
                    }
                    break;
                case 'html':
                    if ( !empty( $wgRawHtml ) ) {
                        $output = $content;
                        break;
                    }
                    // Shouldn't happen otherwise. :)
                    // Deliberate fall-through: treat as nowiki.
                case 'nowiki':
                    $output = wfEscapeHTMLTagsOnly( $content );
                    break;
                default:
                    // No renderer for this element: output stays empty
            }
        } else {
            // Just stripping tags; keep the source
            $output = $tag;
        }

        // Unstrip the output, because unstrip() is no longer recursive so
        // it won't do it itself
        $output = unstrip( $output, $state );

        if ( !$stripcomments && $element == '!--' ) {
            $commentState[$marker] = $output;
        } elseif ( $element == 'html' || $element == 'nowiki' ) {
            $state['nowiki'][$marker] = $output;
        } else {
            $state['general'][$marker] = $output;
        }
    }

    # Unstrip comments unless explicitly told otherwise.
    # (The comments are always stripped prior to this point, so as to
    # not invoke any extension tags / parser hooks contained within
    # a comment.)
    if ( !$stripcomments ) {
        // Put them all back and forget them
        $text = strtr( $text, $commentState );
    }

    return $text;
}
|
797 |
|
/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a random marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * An element without an end tag runs to the end of the text. Markers are
 * numbered with a counter that keeps increasing across calls.
 *
 * @param array  $elements list of element names. Comments are always extracted.
 * @param string $text Source text string.
 * @param array  $matches output parameter, reset on every call
 * @param string $uniq_prefix prefix used for every marker
 * @return string the text with markers in place of the extracted parts
 *
 * @access private
 * @static
 */
function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = '' ) {
    static $n = 1;

    $matches = array();
    $result = '';

    $taglist = implode( '|', $elements );
    $openPattern = "/<({$taglist})(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

    while ( '' != $text ) {
        $parts = preg_split( $openPattern, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
        $result .= $parts[0];

        $partCount = count( $parts );
        if ( $partCount < 5 ) {
            // No further tag or comment in the remaining text
            break;
        }

        // A comment match fills the fourth capture group, which makes the
        // split result one element longer than a tag match.
        $isComment = ( $partCount > 5 );
        if ( $isComment ) {
            $element = $parts[4];
            $attributes = '';
            $close = '';
            $inside = $parts[5];
        } else {
            $element = $parts[1];
            $attributes = $parts[2];
            $close = $parts[3];
            $inside = $parts[4];
        }

        $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
        $result .= $marker;

        if ( $close === '/>' ) {
            // Empty element tag, <tag />
            $content = null;
            $tail = null;
            $text = $inside;
        } else {
            $endPattern = ( $element == '!--' )
                ? '/(-->)/'
                : "/(<\\/$element\\s*>)/i";
            $rest = preg_split( $endPattern, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
            $content = $rest[0];
            if ( count( $rest ) >= 3 ) {
                $tail = $rest[1];
                $text = $rest[2];
            } else {
                # No end tag -- let it run out to the end of the text.
                $tail = '';
                $text = '';
            }
        }

        $matches[$marker] = array(
            $element,
            $content,
            decodeTagAttributes( $attributes ),
            "<$element$attributes$close$content$tail",
        );
    }
    return $result;
}
|
877 |
|
/**
 * Escape html tags.
 * Replaces ", > and < with the HTML entities &quot;, &gt; and &lt;.
 * Ampersands are left untouched.
 *
 * @param string $in text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in ) {
    return str_replace(
        array( '"', '>', '<' ),
        array( '&quot;', '&gt;', '&lt;' ),
        $in );
}
|
891 |
|
/**
 * Replace the markers stored under $state['general'] with their content.
 * Returns the text unchanged when the state has no 'general' entry.
 *
 * Always call unstripNoWiki() after this one.
 *
 * @param string $text
 * @param array  $state strip state, passed by reference
 * @return string
 * @private
 */
function unstrip( $text, &$state ) {
    # TODO: good candidate for FSS
    return isset( $state['general'] )
        ? strtr( $text, $state['general'] )
        : $text;
}
|
908 |
|
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * The UTF8_REPLACEMENT constant is used when it is defined; it is not
 * defined in this file, so the literal UTF-8 bytes of U+FFFD serve as a
 * fallback.
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
    if ( validateCodepoint( $codepoint ) ) {
        return codepointToUtf8( $codepoint );
    }
    return defined( 'UTF8_REPLACEMENT' ) ? UTF8_REPLACEMENT : "\xEF\xBF\xBD";
}
|
923 |
|
/**
 * Decode a named entity. When the name has an entry in the global
 * $wgHtmlEntities table, that entry is converted with codepointToUtf8().
 * Otherwise the pseudo-entity source (eg &foo;) is returned.
 *
 * @param string $name
 * @return string
 */
function decodeEntity( $name ) {
    global $wgHtmlEntities;

    if ( !isset( $wgHtmlEntities[$name] ) ) {
        return "&$name;";
    }
    return codepointToUtf8( $wgHtmlEntities[$name] );
}
|
940 |
|
/**
 * Returns true if a given Unicode codepoint is a valid character in XML:
 * tab, line feed, carriage return, or anything in the ranges
 * U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
    // The three permitted control characters
    if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }
    // Basic plane up to the start of the surrogate range
    if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
        return true;
    }
    // Rest of the basic plane, minus U+FFFE and U+FFFF
    if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
        return true;
    }
    // Supplementary planes
    if ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ) {
        return true;
    }
    return false;
}
|
954 |
|
/**
 * Return UTF-8 sequence for a given Unicode code point.
 * Prints a message and terminates the script when the code point is
 * 0x110000 or higher.
 *
 * @param int $codepoint
 * @return string
 * @public
 */
function codepointToUtf8( $codepoint ) {
    if ( $codepoint < 0x80 ) {
        // Single byte
        return chr( $codepoint );
    }

    // Continuation bytes carry six bits each, lowest bits last.
    $last = chr( ( $codepoint & 0x3f ) | 0x80 );

    if ( $codepoint < 0x800 ) {
        // Two bytes
        return chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 ) . $last;
    }

    $secondLast = chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 );

    if ( $codepoint < 0x10000 ) {
        // Three bytes
        return chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 ) . $secondLast . $last;
    }

    if ( $codepoint < 0x110000 ) {
        // Four bytes
        return chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 )
            . chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 )
            . $secondLast
            . $last;
    }

    echo "Asked for code outside of range ($codepoint)\n";
    die( -1 );
}
|
978 |
|
/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX matches.
 *
 * Dispatches on the capture group that matched: named entities go to
 * decodeEntity(), decimal and hexadecimal references go to decodeChar().
 * A lone ampersand is returned as-is.
 *
 * The helpers are plain functions in this file; there is no Sanitizer
 * class in this port, so they are called directly.
 *
 * @param array $matches match set from MW_CHAR_REFS_REGEX
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return decodeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        return decodeChar( intval( $matches[2] ) );
    } elseif ( $matches[3] != '' ) {
        return decodeChar( hexdec( $matches[3] ) );
    } elseif ( $matches[4] != '' ) {
        return decodeChar( hexdec( $matches[4] ) );
    }
    # Last case should be an ampersand by itself
    return $matches[0];
}
|
996 |
|
997 ?> |