1
|
1 |
<?php
|
|
2 |
|
|
3 |
/**
|
|
4 |
* Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
|
|
5 |
* Version 1.0 (Banshee)
|
|
6 |
* Copyright (C) 2006-2007 Dan Fuhry
|
|
7 |
*
|
|
8 |
* This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
|
|
9 |
* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
|
|
10 |
*
|
|
11 |
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
|
|
12 |
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
|
|
13 |
*
|
|
14 |
* This script contains code originally found in MediaWiki (http://www.mediawiki.org). MediaWiki is also licensed under
|
|
15 |
* the GPLv2; see the file GPL included with this package for details.
|
|
16 |
*
|
|
17 |
* We're using the MW parser because the Text_Wiki version simply refused to work under PHP 5.2.0. Porting this was
|
|
18 |
* _not_ easy. <leaves to get cup of coffee>
|
|
19 |
*/
|
|
20 |
|
|
21 |
// Shared strip state used by the strip/unstrip helpers in this file. The
// `global` statement only has an effect when this file is included from
// inside a function scope.
global $mStripState, $wgRandomKey;
$mStripState = array();

// Regex fragments interpolated into MW_ATTRIBS_REGEX below.
$attrib = '[a-zA-Z0-9]';
$space = '[\x09\x0a\x0d\x20]';

// Character references. Capture groups:
//   1 = named entity, 2 = decimal reference, 3 = hex reference ("x"),
//   4 = hex reference ("X"), 5 = a bare ampersand.
// The pattern uses the /x flag, so the line breaks inside it are ignored.
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9]+);
|&\#([0-9]+);
|&\#x([0-9A-Za-z]+);
|&\#X([0-9A-Za-z]+);
|(&)/x' );

// One attribute of a tag. Capture groups:
//   1 = attribute name, 2 = the whole "= value" part (optional),
//   3 = double-quoted value, 4 = single-quoted value,
//   5 = unquoted value, 6 = unquoted "#rrggbb"-style colour.
define( 'MW_ATTRIBS_REGEX',
"/(?:^|$space)($attrib+)
($space*=$space*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?=$space|\$)/sx" );
|
|
47 |
|
|
48 |
/**
 * Public entry point: run the wiki table parser over a block of text.
 *
 * The strip/unstrip passes that the MediaWiki parser would normally wrap
 * around this step are not performed here; only doTableStuff() is applied.
 *
 * @param string $text the text to parse
 * @return string the text with table markup converted
 * @access public
 */

function process_tables( $text )
{
  // The strip state is still bound here although the strip pass is not run.
  global $mStripState;
  $x =& $mStripState;

  return doTableStuff( $text );
}
|
|
72 |
|
|
73 |
/**
 * Convert wiki table syntax ({| ... |}) into HTML table markup.
 *
 * The text is processed line by line. Generated tags are wrapped in
 * <nowiki>...</nowiki>, except for the <dl><dd> indent openers, which are
 * emitted bare. Lines outside any table are left untouched. Tables still open
 * at the end of the text are closed automatically, using the tag name of the
 * cell that is actually open (td, th or caption).
 *
 * @param string $t the text to parse
 * @return string the text with table markup replaced
 * @access private
 */
function doTableStuff( $t ) {

  $t = explode( "\n", $t );
  // Each array below is a stack with one entry per currently open table.
  $td = array();             # Is currently a cell (td/th/caption) open?
  $ltd = array();            # Tag name of that cell: td, th or caption
  $tr = array();             # Is currently a tr tag open?
  $ltr = array();            # tr attributes waiting for the next <tr>
  $has_opened_tr = array();  # Did this table open a <tr> element?
  $indent_level = 0;         # indent level (leading ':') of the table
  foreach ( $t as $k => $x )
  {
    $x = trim( $x );
    $fc = substr( $x, 0, 1 );
    if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) ) {
      // Table start, optionally indented with leading colons.
      $indent_level = strlen( $matches[1] );

      $attributes = unstripForHTML( $matches[2] );

      $t[$k] = str_repeat( '<dl><dd>', $indent_level ) .
        '<nowiki><table' . fixTagAttributes( $attributes, 'table' ) . '></nowiki>';
      array_push( $td, false );
      array_push( $ltd, '' );
      array_push( $tr, false );
      array_push( $ltr, '' );
      array_push( $has_opened_tr, false );
    }
    else if ( count( $td ) == 0 ) { } # Not inside a table: don't do any of the following
    else if ( '|}' == substr( $x, 0, 2 ) ) {
      // Table end. $z is built back to front: each step prepends to it.
      $z = "<nowiki></table></nowiki>" . substr( $x, 2 );
      $l = array_pop( $ltd );
      // A table without any row gets an empty one.
      if ( !array_pop( $has_opened_tr ) ) $z = "<nowiki><tr><td></td></tr></nowiki>" . $z;
      if ( array_pop( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z;
      if ( array_pop( $td ) ) $z = '<nowiki></'.$l.'></nowiki>' . $z;
      array_pop( $ltr );
      $t[$k] = $z . str_repeat( '<nowiki></dd></dl></nowiki>', $indent_level );
    }
    else if ( '|-' == substr( $x, 0, 2 ) ) { # Allows for |---------------
      // Row separator: strip the leading "|" and every following "-";
      // what remains is the attribute text for the next row.
      $x = substr( $x, 1 );
      while ( $x != '' && substr( $x, 0, 1 ) == '-' ) $x = substr( $x, 1 );
      $z = '';
      $l = array_pop( $ltd );
      array_pop( $has_opened_tr );
      array_push( $has_opened_tr, true );
      if ( array_pop( $tr ) ) $z = '<nowiki></tr></nowiki>' . $z;
      if ( array_pop( $td ) ) $z = '<nowiki></'.$l.'></nowiki>' . $z;
      array_pop( $ltr );
      $t[$k] = $z;
      array_push( $tr, false );
      array_push( $td, false );
      array_push( $ltd, '' );
      $attributes = unstripForHTML( $x );
      // The <tr> itself is emitted lazily, when the first cell of the row appears.
      array_push( $ltr, fixTagAttributes( $attributes, 'tr' ) );
    }
    else if ( '|' == $fc || '!' == $fc || '|+' == substr( $x, 0, 2 ) ) { # Cell, header or caption
      # $x is a table row
      if ( '|+' == substr( $x, 0, 2 ) ) {
        $fc = '+';
        $x = substr( $x, 1 );
      }
      $after = substr( $x, 1 );
      if ( $fc == '!' ) $after = str_replace( '!!', '||', $after );

      // Split up multiple cells on the same line.
      // FIXME: This can result in improper nesting of tags processed
      // by earlier parser steps, but should avoid splitting up eg
      // attribute values containing literal "||".
      $after = wfExplodeMarkup( '||', $after );

      $t[$k] = '';

      # Loop through each table cell
      foreach ( $after as $theline )
      {
        $z = '';
        if ( $fc != '+' )
        {
          // Open the row if this is its first cell.
          $tra = array_pop( $ltr );
          if ( !array_pop( $tr ) ) $z = '<nowiki><tr'.$tra."></nowiki>\n";
          array_push( $tr, true );
          array_push( $ltr, '' );
          array_pop( $has_opened_tr );
          array_push( $has_opened_tr, true );
        }

        // Close the previous cell, if any, before opening the new one.
        $l = array_pop( $ltd );
        if ( array_pop( $td ) ) $z = '<nowiki></'.$l.'></nowiki>' . $z;
        if ( $fc == '|' ) $l = 'td';
        else if ( $fc == '!' ) $l = 'th';
        else if ( $fc == '+' ) $l = 'caption';
        else $l = '';
        array_push( $ltd, $l );

        # Cell parameters
        $y = explode( '|', $theline, 2 );
        # Note that a '|' inside an invalid link should not
        # be mistaken as delimiting cell parameters
        if ( strpos( $y[0], '[[' ) !== false ) {
          $y = array( $theline );
        }
        if ( count( $y ) == 1 )
          $y = "{$z}<nowiki><{$l}></nowiki>{$y[0]}";
        else {
          $attributes = unstripForHTML( $y[0] );
          $y = "{$z}<nowiki><{$l}".fixTagAttributes( $attributes, $l )."></nowiki>{$y[1]}";
        }
        $t[$k] .= $y;
        array_push( $td, true );
      }
    }
  }

  # Closing open td, tr && table
  # NOTE(review): unlike the '|}' branch, no </dd></dl> is emitted here for
  # indented tables that were never closed.
  while ( count( $td ) > 0 )
  {
    $l = array_pop( $ltd );
    // Close the cell with its real tag name; it may be th or caption, not only td.
    if ( array_pop( $td ) ) $t[] = '<nowiki></'.$l.'></nowiki>';
    if ( array_pop( $tr ) ) $t[] = '<nowiki></tr></nowiki>';
    if ( !array_pop( $has_opened_tr ) ) $t[] = "<nowiki><tr><td></td></tr></nowiki>";
    $t[] = '<nowiki></table></nowiki>';
  }

  $t = implode( "\n", $t );

  # special case: don't return empty table
  if ( $t == "<nowiki><table></nowiki>\n<nowiki><tr><td></td></tr></nowiki>\n<nowiki></table></nowiki>" )
    $t = '';
  return $t;
}
|
|
208 |
|
|
209 |
/**
 * Turn a raw attribute fragment of a tag into a normalized attribute string.
 *
 * The fragment is decoded with decodeTagAttributes(), filtered with
 * validateTagAttributes() for the given element, and every surviving
 * name/value pair is re-emitted as name="value" with the name passed through
 * htmlspecialchars() and the value through safeEncodeAttribute().
 *
 * @param string $text raw attribute text
 * @param string $element element name used for the whitelist lookup
 * @return string '' when nothing survives, otherwise the attributes joined by
 *                single spaces with one leading space
 */
function fixTagAttributes( $text, $element ) {
  if ( trim( $text ) == '' ) {
    return '';
  }

  $decoded = decodeTagAttributes( $text );
  $allowed = validateTagAttributes( $decoded, $element );

  $rendered = array();
  foreach ( $allowed as $name => $rawValue ) {
    $rendered[] = htmlspecialchars( $name ) . '="' . safeEncodeAttribute( $rawValue ) . '"';
  }

  if ( count( $rendered ) ) {
    return ' ' . implode( ' ', $rendered );
  }
  return '';
}
|
|
245 |
|
|
246 |
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * After encodeAttribute() has HTML-escaped the value, characters and words
 * that later parser passes could act on (brackets, braces, pipes, double
 * apostrophes, double underscores, magic words) are replaced with numeric
 * character references.
 *
 * @param string $text raw attribute value
 * @return string HTML-encoded text fragment
 */
function safeEncodeAttribute( $text ) {
  $encValue = encodeAttribute( $text );

  # Templates and links may be expanded in later parsing,
  # creating invalid or dangerous output. Suppress this.
  $encValue = strtr( $encValue, array(
    '<'    => '&lt;',   // This should never happen,
    '>'    => '&gt;',   // we've received invalid input
    '"'    => '&quot;', // which should have been escaped.
    '{'    => '&#123;',
    '['    => '&#91;',
    "''"   => '&#39;&#39;',
    'ISBN' => '&#73;SBN', // only the first letter is encoded, which breaks the magic word
    'RFC'  => '&#82;FC',
    'PMID' => '&#80;MID',
    '|'    => '&#124;',
    '__'   => '&#95;_',
  ) );

  return $encValue;
}
|
|
273 |
|
|
274 |
/**
 * Encode an attribute value for HTML output.
 *
 * The value is passed through htmlspecialchars(); newline, carriage return
 * and tab characters are then replaced with numeric character references.
 *
 * @param string $text raw attribute value
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
  $encValue = htmlspecialchars( $text );

  // Whitespace is normalized during attribute decoding,
  // so if we've been passed non-spaces we must encode them
  // ahead of time or they won't be preserved.
  $encValue = strtr( $encValue, array(
    "\n" => '&#10;',
    "\r" => '&#13;',
    "\t" => '&#9;',
  ) );

  return $encValue;
}
|
|
293 |
|
|
294 |
/**
 * Restore stripped content in a text fragment using the global strip state:
 * first the general markers via unstrip(), then the nowiki markers via
 * unstripNoWiki().
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text ) {
  global $mStripState;

  $restored = unstrip( $text, $mStripState );
  return unstripNoWiki( $restored, $mStripState );
}
|
|
300 |
|
|
301 |
/**
 * Replace nowiki markers in the text with their stored content.
 * Always call this after unstrip() to preserve the order.
 *
 * @param string $text
 * @param array $state strip state; only the 'nowiki' entry (marker => content) is read
 * @return string the text unchanged when the state has no 'nowiki' entry
 * @private
 */
function unstripNoWiki( $text, &$state ) {
  # TODO: good candidate for FSS
  return isset( $state['nowiki'] )
    ? strtr( $text, $state['nowiki'] )
    : $text;
}
|
|
316 |
|
|
317 |
/**
 * Filter an attribute map down to what is allowed for the given element.
 *
 * - Attributes not on the element's whitelist are dropped.
 * - A 'style' value is run through checkCss(); the attribute is dropped when
 *   checkCss() returns false.
 * - An 'id' value is run through escapeId().
 *
 * @param array $attribs attribute name => value
 * @param string $element
 * @return array the surviving attributes, one entry per name
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
  $allowed = array_flip( attributeWhitelist( $element ) );
  $clean = array();

  foreach ( $attribs as $name => $val ) {
    if ( isset( $allowed[$name] ) ) {
      # Strip javascript "expression" from stylesheets.
      # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
      if ( $name == 'style' ) {
        $val = checkCss( $val );
        if ( $val === false ) {
          # haxx0r
          continue;
        }
      }

      if ( $name === 'id' ) {
        $val = escapeId( $val );
      }

      // A later occurrence of the same name overrides an earlier one, so the
      // output only ever has one attribute of each name.
      $clean[$name] = $val;
    }
  }

  return $clean;
}
|
|
357 |
|
|
358 |
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * The returned string has character references decoded and comments removed.
 * The safety check runs on a further-normalized copy in which CSS hex escapes
 * are decoded and remaining backslashes are dropped, so escaped spellings of
 * forbidden words are caught as well.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * @param string $value
 * @return mixed the sanitized CSS string, or false when it is rejected
 */
function checkCss( $value ) {
  $stripped = decodeCharReferences( $value );

  // Remove any comments; IE gets token splitting wrong
  $stripped = preg_replace( '!/\\*.*?\\*/!S', '', $stripped );
  $value = $stripped;

  // ... and continue checks.
  // Decode CSS escapes (backslash + 1-6 hex digits + optional whitespace).
  // This used the /e modifier, which no longer exists in PHP 7+; there
  // preg_replace() returned NULL and every value passed the check below.
  $stripped = preg_replace_callback(
    '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
    'checkCssDecodeEscape',
    $stripped );
  $stripped = str_replace( '\\', '', $stripped );
  if ( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
      $stripped ) ) {
    # haxx0r
    return false;
  }

  return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): decode one CSS hex escape.
 *
 * @param array $matches $matches[1] holds the hex digits of the escape
 * @return string the UTF-8 character, or U+FFFD when the value is beyond
 *                U+10FFFF (codepointToUtf8() would terminate the script there)
 * @access private
 */
function checkCssDecodeEscape( $matches ) {
  $codepoint = hexdec( $matches[1] );
  if ( $codepoint >= 0x110000 ) {
    return "\xEF\xBF\xBD";
  }
  return codepointToUtf8( $codepoint );
}
|
|
386 |
|
|
387 |
/**
 * Decode character references (numeric or named) in the text by passing
 * every MW_CHAR_REFS_REGEX match to decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access public
 * @static
 */
function decodeCharReferences( $text ) {
  $decoded = preg_replace_callback( MW_CHAR_REFS_REGEX, 'decodeCharReferencesCallback', $text );
  return $decoded;
}
|
|
402 |
|
|
403 |
/**
 * Look up the list of acceptable attribute names for an element.
 *
 * The full table is built once by setupAttributeWhitelist() and cached in a
 * static variable for later calls.
 *
 * @param string $element
 * @return array attribute names; empty for an element that is not in the table
 */
function attributeWhitelist( $element ) {
  static $list = null;

  if ( $list === null ) {
    $list = setupAttributeWhitelist();
  }

  if ( isset( $list[$element] ) ) {
    return $list[$element];
  }
  return array();
}
|
|
419 |
|
|
420 |
/**
 * Build the table of allowed attributes per element.
 *
 * Section numbers in the comments refer to the HTML 4.01 standard
 * (http://www.w3.org/TR/html4/) describing the element.
 *
 * @return array element name => list of allowed attribute names
 */
function setupAttributeWhitelist() {
  // Attribute groups shared by several elements.
  $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
  $tablealign = array( 'align', 'char', 'charoff', 'valign' );
  $tablecell = array(
    'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
    'nowrap',  # deprecated
    'width',   # deprecated
    'height',  # deprecated
    'bgcolor', # deprecated
  );

  // Merged lists, each built once and reused below.
  $block = array_merge( $common, array( 'align' ) );
  $edit = array_merge( $common, array( 'cite', 'datetime' ) );
  $rowgroup = array_merge( $common, $tablealign );
  $column = array_merge( $common, array( 'span', 'width' ), $tablealign );
  $cell = array_merge( $common, $tablecell, $tablealign );

  return array(
    # 7.5.4
    'div'        => $block,
    'center'     => $common, # deprecated
    'span'       => $block, # ??

    # 7.5.5
    'h1'         => $block,
    'h2'         => $block,
    'h3'         => $block,
    'h4'         => $block,
    'h5'         => $block,
    'h6'         => $block,

    # 7.5.6: address, 8.2.4: bdo -- not listed

    # 9.2.1 (dfn, samp, kbd, abbr are not listed)
    'em'         => $common,
    'strong'     => $common,
    'cite'       => $common,
    'code'       => $common,
    'var'        => $common,

    # 9.2.2 (q is not listed)
    'blockquote' => array_merge( $common, array( 'cite' ) ),

    # 9.2.3
    'sub'        => $common,
    'sup'        => $common,

    # 9.3.1
    'p'          => $block,

    # 9.3.2
    'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

    # 9.3.4
    'pre'        => array_merge( $common, array( 'width' ) ),

    # 9.4
    'ins'        => $edit,
    'del'        => $edit,

    # 10.2
    'ul'         => array_merge( $common, array( 'type' ) ),
    'ol'         => array_merge( $common, array( 'type', 'start' ) ),
    'li'         => array_merge( $common, array( 'type', 'value' ) ),

    # 10.3
    'dl'         => $common,
    'dd'         => $common,
    'dt'         => $common,

    # 11.2.1
    'table'      => array_merge( $common, array(
      'summary', 'width', 'border', 'frame',
      'rules', 'cellspacing', 'cellpadding',
      'align', 'bgcolor',
    ) ),

    # 11.2.2
    'caption'    => array_merge( $common, array( 'align' ) ),

    # 11.2.3
    'thead'      => $rowgroup,
    'tfoot'      => $rowgroup,
    'tbody'      => $rowgroup,

    # 11.2.4
    'colgroup'   => $column,
    'col'        => $column,

    # 11.2.5
    'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

    # 11.2.6
    'td'         => $cell,
    'th'         => $cell,

    # 12.2
    # added by dan
    'a'          => array_merge( $common, array( 'href', 'name' ) ),

    # 13.2
    # added by dan
    'img'        => array_merge( $common, array( 'src', 'width', 'height', 'alt' ) ),

    # 15.2.1
    'tt'         => $common,
    'b'          => $common,
    'i'          => $common,
    'big'        => $common,
    'small'      => $common,
    'strike'     => $common,
    's'          => $common,
    'u'          => $common,

    # 15.2.2 (basefont is not listed)
    'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),

    # 15.3
    'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

    # XHTML Ruby annotation text module, simple ruby only.
    # http://www.w3c.org/TR/ruby/
    # (rbc and rtc are not listed; rt does not allow rbspan)
    'ruby'       => $common,
    'rb'         => $common,
    'rt'         => $common,
    'rp'         => $common,

    # For compatibility with the XHTML parser.
    'nowiki'     => array(),
    'noinclude'  => array(),
    'nodisplay'  => array(),

    # XHTML stuff
    'acronym'    => $common,
  );
}
|
|
575 |
|
|
576 |
/**
 * Escape a value so it can be used in an id attribute. The value is escaped
 * only, not validated (see first link).
 *
 * Spaces become underscores, character references are decoded, the result is
 * URL-encoded, and finally "%3A" is turned back into ":" and every remaining
 * "%" into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
  $underscored = strtr( $id, ' ', '_' );
  $encoded = urlencode( decodeCharReferences( $underscored ) );

  // Order matters: "%3A" has to be handled before the bare "%".
  return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
}
|
|
602 |
|
|
603 |
/**
 * More or less "markup-safe" explode().
 * Occurrences of the separator inside <...> are not split on.
 *
 * Inside every <...> span the separator is temporarily swapped for a NUL
 * byte, the text is exploded, and the separator is then put back into each
 * piece.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
function wfExplodeMarkup( $separator, $text ) {
  $placeholder = "\x00";

  // Just in case the input already contains the placeholder byte...
  $text = str_replace( $placeholder, '', $text );

  // Hide separators that sit inside tags.
  $hider = new ReplacerCallback( $separator, $placeholder );
  $masked = preg_replace_callback( '/(<.*?>)/', array( $hider, 'go' ), $text );

  $pieces = explode( $separator, $masked );
  foreach ( array_keys( $pieces ) as $index ) {
    $pieces[$index] = str_replace( $placeholder, $separator, $pieces[$index] );
  }

  return $pieces;
}
|
|
627 |
|
|
628 |
/**
 * Small callback object for preg_replace_callback(): replaces one fixed
 * string with another inside the first capture group of a match.
 */
class ReplacerCallback {
  /** @var string the string to search for */
  public $from;
  /** @var string the string to put in its place */
  public $to;

  /**
   * A constructor named after the class is no longer called by `new` in
   * PHP 8, which left $from and $to unset; __construct() works everywhere.
   *
   * @param string $from the string to search for
   * @param string $to the replacement string
   */
  function __construct( $from, $to ) {
    $this->from = $from;
    $this->to = $to;
  }

  /**
   * @param array $matches match array; only $matches[1] is used
   * @return string $matches[1] with every $from replaced by $to
   */
  function go( $matches ) {
    return str_replace( $this->from, $this->to, $matches[1] );
  }
}
|
|
638 |
|
|
639 |
/**
 * Parse a partial tag string into an associative array of attributes.
 *
 * Attribute names are lowercased. In each value, runs of whitespace are
 * collapsed to a single space, the value is trimmed, and character references
 * are decoded. When a name occurs more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array attribute name => value; empty when the text is blank or
 *               nothing matches
 */
function decodeTagAttributes( $text ) {
  if ( trim( $text ) == '' ) {
    return array();
  }

  $sets = array();
  $found = preg_match_all( MW_ATTRIBS_REGEX, $text, $sets, PREG_SET_ORDER );
  if ( !$found ) {
    return array();
  }

  $result = array();
  foreach ( $sets as $match ) {
    $name = strtolower( $match[1] );

    // Normalize whitespace
    $raw = preg_replace( '/[\t\r\n ]+/', ' ', getTagAttributeCallback( $match ) );
    $raw = trim( $raw );

    // Decode character references
    $result[$name] = decodeCharReferences( $raw );
  }

  return $result;
}
|
|
676 |
|
|
677 |
/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * The highest-numbered value group present wins: 6 (unquoted #colour),
 * 5 (unquoted), 4 (single-quoted), 3 (double-quoted). When the match has no
 * "= value" part at all (group 2 absent), the attribute name itself is
 * returned as the value.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
  for ( $group = 6; $group >= 3; $group-- ) {
    if ( isset( $set[$group] ) ) {
      return $set[$group];
    }
  }

  if ( !isset( $set[2] ) ) {
    # In XHTML, attributes must have a value.
    # For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
  }

  die_friendly('Parser error', "<p>Tag conditions not met. This should never happen and is a bug.</p>" );
}
|
|
706 |
|
|
707 |
/**
 * Strip <nowiki> and <gallery> elements and HTML comments out of the text,
 * replacing each with a unique marker, and record what is needed to put them
 * back with unstrip() / unstripNoWiki().
 *
 * - nowiki content is stored HTML-escaped under $state['nowiki'].
 * - Other stripped elements are stored as their original source under
 *   $state['general'].
 * - Comments are put straight back into the text unless $stripcomments is set;
 *   when it is set they are stored under $state['general'].
 *
 * If $state is already a valid strip state, the new entries are added to it.
 * A fresh random marker prefix is written to the global $wgRandomKey on every
 * call.
 *
 * @param string $text the text to strip
 * @param array $state strip state, modified in place
 * @param bool $stripcomments when set, HTML comments <!-- like this -->
 *  will be stripped in addition to other tags. This is important
 *  for section editing, where these comments cause confusion when
 *  counting the sections in the wikisource
 *
 * @param array $dontstrip contains tags which should not be stripped;
 *  used to prevent stripping of <gallery> when saving (fixes bug 2700)
 *
 * @return string the text with markers in place of the stripped elements
 * @access private
 */
function mwStrip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
  global $wgRandomKey;
  $render = true;

  $wgRandomKey = "\x07UNIQ" . dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
  $uniq_prefix =& $wgRandomKey;
  $commentState = array();

  $elements = array( 'nowiki', 'gallery' );

  # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
  foreach ( $elements as $k => $v ) {
    if ( !in_array( $v, $dontstrip ) ) continue;
    unset( $elements[$k] );
  }

  $matches = array();
  $text = extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );

  foreach ( $matches as $marker => $data ) {
    list( $element, $content, $params, $tag ) = $data;

    // Default for every element: keep the source. Without this, elements
    // that have no case below (e.g. gallery) used an undefined $output, or
    // the value left over from the previous iteration.
    $output = $tag;

    if ( $render ) {
      $tagName = strtolower( $element );
      switch ( $tagName ) {
        case '!--':
          // Comment
          if ( substr( $tag, -3 ) != '-->' ) {
            // Unclosed comment in input.
            // Close it so later stripping can remove it
            $output = "$tag-->";
          }
          break;
        case 'html':
          // $wgRawHtml is never brought into scope in this function, so this
          // test is always false; !empty() only avoids the undefined-variable
          // notice the bare read produced.
          if ( !empty( $wgRawHtml ) ) {
            $output = $content;
            break;
          }
          // Shouldn't happen otherwise. :)
          // (deliberate fall-through to the nowiki handling)
        case 'nowiki':
          $output = wfEscapeHTMLTagsOnly( $content );
          break;
        default:
      }
    }

    // Unstrip the output, because unstrip() is no longer recursive so
    // it won't do it itself
    $output = unstrip( $output, $state );

    if ( !$stripcomments && $element == '!--' ) {
      $commentState[$marker] = $output;
    } elseif ( $element == 'html' || $element == 'nowiki' ) {
      $state['nowiki'][$marker] = $output;
    } else {
      $state['general'][$marker] = $output;
    }
  }

  # Unstrip comments unless explicitly told otherwise.
  # (The comments are always stripped prior to this point, so as to
  # not invoke any extension tags / parser hooks contained within
  # a comment.)
  if ( !$stripcomments ) {
    // Put them all back and forget them
    $text = strtr( $text, $commentState );
  }

  return $text;
}
|
|
797 |
|
|
798 |
/**
 * Replaces all occurrences of HTML-style comments and the given tags
 * in the text with a unique marker and returns the new text. The output
 * parameter $matches will be an associative array filled with data in
 * the form:
 *   'UNIQ-xxxxx' => array(
 *     'element',
 *     'tag content',
 *     array( 'param' => 'x' ),
 *     '<element param="x">tag content</element>' ) )
 *
 * Markers have the form "<prefix>-<element>-<8 hex digits>-QINU"; the number
 * comes from a counter that keeps increasing across calls. An element without
 * a closing tag runs to the end of the text.
 *
 * @param $elements list of element names. Comments are always extracted.
 * @param $text Source text string.
 * @param $matches output array, reset and then filled as described above
 * @param $uniq_prefix
 * @return string the text with markers substituted
 *
 * @access private
 * @static
 */
function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
  static $n = 1;

  $matches = array();
  $result = '';

  $taglist = implode( '|', $elements );
  $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";

  while ( '' != $text ) {
    $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
    $result .= $p[0];

    // Fewer than 5 pieces: nothing matched, $p[0] was the rest of the text.
    $pieces = count( $p );
    if ( $pieces < 5 ) {
      break;
    }

    if ( $pieces > 5 ) {
      // comment: the three tag groups are empty, group 4 holds "!--"
      $element = $p[4];
      $attributes = '';
      $close = '';
      $inside = $p[5];
    } else {
      // tag
      list( , $element, $attributes, $close, $inside ) = $p;
    }

    $marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
    $result .= $marker;

    if ( $close === '/>' ) {
      // Empty element tag, <tag />
      $content = null;
      $tail = null;
      $text = $inside;
    } else {
      $end = ( $element == '!--' )
        ? '/(-->)/'
        : "/(<\\/$element\\s*>)/i";
      $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
      $content = $q[0];
      if ( count( $q ) >= 3 ) {
        $tail = $q[1];
        $text = $q[2];
      } else {
        # No end tag -- let it run out to the end of the text.
        $tail = '';
        $text = '';
      }
    }

    $matches[$marker] = array(
      $element,
      $content,
      decodeTagAttributes( $attributes ),
      "<$element$attributes$close$content$tail",
    );
  }

  return $result;
}
|
|
877 |
|
|
878 |
/**
 * Escape html tags
 * Basically replacing " > and < with HTML entities ( &quot;, &gt;, &lt;)
 *
 * Ampersands are deliberately left alone, so existing entities in the input
 * are not double-escaped.
 *
 * @param $in String: text that might contain HTML tags.
 * @return string Escaped string
 */
function wfEscapeHTMLTagsOnly( $in ) {
  return str_replace(
    array( '"', '>', '<' ),
    array( '&quot;', '&gt;', '&lt;' ),
    $in );
}
|
|
891 |
|
|
892 |
/**
 * Replace general strip markers in the text with their stored content.
 *
 * always call unstripNoWiki() after this one
 *
 * @param string $text
 * @param array $state strip state; only the 'general' entry (marker => content) is read
 * @return string the text unchanged when the state has no 'general' entry
 * @private
 */
function unstrip( $text, &$state ) {
  # TODO: good candidate for FSS
  return isset( $state['general'] )
    ? strtr( $text, $state['general'] )
    : $text;
}
|
|
908 |
|
|
909 |
/**
 * Return the UTF-8 string for a codepoint that validateCodepoint() accepts,
 * otherwise the UTF8_REPLACEMENT constant (U+FFFD REPLACEMENT CHARACTER).
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
  return validateCodepoint( $codepoint )
    ? codepointToUtf8( $codepoint )
    : UTF8_REPLACEMENT;
}
|
|
923 |
|
|
924 |
/**
 * Decode a named entity using the global $wgHtmlEntities table
 * (name => codepoint). When the name is in the table, the UTF-8 encoding of
 * that character is returned; otherwise the pseudo-entity source is returned
 * unchanged (eg &foo;)
 *
 * @param string $name entity name without the leading "&" and trailing ";"
 * @return string
 */
function decodeEntity( $name ) {
  global $wgHtmlEntities;

  if ( !isset( $wgHtmlEntities[$name] ) ) {
    return "&$name;";
  }
  return codepointToUtf8( $wgHtmlEntities[$name] );
}
|
|
940 |
|
|
941 |
/**
 * Returns true if a given Unicode codepoint is a valid character in XML:
 * tab, line feed, carriage return, U+0020..U+D7FF, U+E000..U+FFFD or
 * U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
  // The only control characters allowed.
  if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
    return true;
  }
  // Basic Multilingual Plane below the surrogate range.
  if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
    return true;
  }
  // Rest of the BMP, excluding U+FFFE and U+FFFF.
  if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
    return true;
  }
  // Supplementary planes.
  return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
|
|
954 |
|
|
955 |
/**
 * Return UTF-8 sequence for a given Unicode code point.
 *
 * For a code point of 0x110000 or above this prints an error message and
 * terminates the script.
 *
 * @param $codepoint Integer:
 * @return String
 * @public
 */
function codepointToUtf8( $codepoint ) {
  // 1 byte: 0xxxxxxx
  if ( $codepoint < 0x80 ) {
    return chr( $codepoint );
  }

  // 2 bytes: 110xxxxx 10xxxxxx
  if ( $codepoint < 0x800 ) {
    return chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 )
         . chr( ( $codepoint & 0x3f ) | 0x80 );
  }

  // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
  if ( $codepoint < 0x10000 ) {
    return chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 )
         . chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 )
         . chr( ( $codepoint & 0x3f ) | 0x80 );
  }

  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if ( $codepoint < 0x110000 ) {
    return chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 )
         . chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 )
         . chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 )
         . chr( ( $codepoint & 0x3f ) | 0x80 );
  }

  echo "Asked for code outside of range ($codepoint)\n";
  die( -1 );
}
|
|
978 |
|
|
979 |
/**
 * preg_replace_callback() handler for MW_CHAR_REFS_REGEX matches.
 *
 * Dispatches on which capture group matched: a named entity goes to
 * decodeEntity(), decimal and hexadecimal references go to decodeChar().
 * These are the plain functions defined in this file; the former
 * Sanitizer::... static calls referred to a class this file does not define.
 *
 * @param array $matches match array from MW_CHAR_REFS_REGEX
 * @return string the decoded text, or the match unchanged for a bare ampersand
 */
function decodeCharReferencesCallback( $matches ) {
  if ( $matches[1] != '' ) {
    // Named entity
    return decodeEntity( $matches[1] );
  } elseif ( $matches[2] != '' ) {
    // Decimal reference
    return decodeChar( intval( $matches[2] ) );
  } elseif ( $matches[3] != '' ) {
    // Hexadecimal reference written with "x"
    return decodeChar( hexdec( $matches[3] ) );
  } elseif ( $matches[4] != '' ) {
    // Hexadecimal reference written with "X"
    return decodeChar( hexdec( $matches[4] ) );
  }
  # Last case should be an ampersand by itself
  return $matches[0];
}
|
|
996 |
|
|
997 |
?>
|