Файловый менеджер - Редактировать - /var/www/html/libraries/kunena/External/Nbbc/src/BBCodeLexer.php
Назад
<?php
/**
 * @copyright Copyright (C) 2008-10, Sean Werkema. All rights reserved.
 * @copyright 2016 Vanilla Forums Inc. (changes only)
 * @license BSDv2
 */

namespace Nbbc;

//-----------------------------------------------------------------------------
//
//  nbbc_lex.php
//
//  This file is part of NBBC, the New BBCode Parser.
//
//  NBBC implements a fully-validating, high-speed, extensible parser for the
//  BBCode document language.  Its output is XHTML 1.0 Strict conformant no
//  matter what its input is.  NBBC supports the full standard BBCode language,
//  as well as comments, columns, enhanced quotes, spoilers, acronyms, wiki
//  links, several list styles, justification, indentation, and smileys, among
//  other advanced features.
//
//-----------------------------------------------------------------------------
//
//  Copyright (c) 2008-9, the Phantom Inker.  All rights reserved.
//
//  Redistribution and use in source and binary forms, with or without
//  modification, are permitted provided that the following conditions
//  are met:
//
//    * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//
//    * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in
//       the documentation and/or other materials provided with the
//       distribution.
//
//  THIS SOFTWARE IS PROVIDED BY THE PHANTOM INKER "AS IS" AND ANY EXPRESS
//  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
//  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
//  DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
//  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
//  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
//  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
//  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
//  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
//  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
//  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//-----------------------------------------------------------------------------
//
//  This file implements the NBBC lexical analyzer, which breaks down the
//  input text from characters into tokens.  This uses an event-based
//  interface, somewhat like lex or flex uses, wherein each time
//  $this->NextToken is called, the next token is returned until it returns
//  BBCODE_EOI at the end of the input.
//
//-----------------------------------------------------------------------------

/**
 * Lexical analyzer for BBCode input.
 *
 * The constructor splits the input string into an array of pieces that strictly
 * alternate between plain text and delimiters (tags, comments, wiki-links, rules,
 * newlines, whitespace). {@link nextToken()} then walks that array one piece at a
 * time, publishing the current token through {@link $token}, {@link $text} and
 * {@link $tag}.
 */
class BBCodeLexer
{
    /**
     * Lexer: Next token is plain text.
     */
    const BBCODE_LEXSTATE_TEXT = 0;

    /**
     * Lexer: Next token is non-text element.
     */
    const BBCODE_LEXSTATE_TAG = 1;

    public $token;          // Return token type: One of the BBCODE_* constants.
    public $text;           // Actual exact, original text of token.
    public $tag;            // If token is a tag, this is the decoded array version.
    public $state;          // Next state of the lexer's state machine: text, or tag/ws/nl
    public $input;          // The input string, split into an array of tokens.
    public $ptr;            // Read pointer into the input array.
    public $unget;          // Whether to "unget" the last token.
    public $verbatim;       // In verbatim mode, we return all input, unparsed, including comments.
    public $debug;          // In debug mode, we dump decoded tags when we find them.
    public $tagmarker;      // Which kind of tag marker we're using: "[", "<", "(", or "{"
    public $end_tagmarker;  // The ending tag marker: "]", ">", ")", or "}"
    public $pat_main;       // Main tag-matching pattern.
    public $pat_comment;    // Pattern for matching single-line comments.
    public $pat_comment2;   // Pattern for matching multi-line comments.
    public $pat_wiki;       // Pattern for matching wiki-links.

    /**
     * Instantiate a new instance of the {@link BBCodeLexer} class.
     *
     * @param string $string The string to be broken up into tokens.
     * @param string $tagmarker The BBCode tag marker. Anything other than "[", "<", "{" or "("
     * falls back to "[".
     * @throws BBCodeException Thrown when preg_split() fails on the input.
     */
    public function __construct($string, $tagmarker = '[')
    {
        // First thing we do is to split the input string into tuples of
        // text and tags.  This will make it easy to tokenize.  We define a tag as
        // anything starting with a [, ending with a ], and containing no [ or ] in
        // between unless surrounded by "" or '', and containing no newlines.
        // We also separate out whitespace and newlines.

        // Choose a tag marker based on the possible tag markers.
        $regex_beginmarkers = ['[' => '\[', '<' => '<', '{' => '\{', '(' => '\('];
        $regex_endmarkers = ['[' => '\]', '<' => '>', '{' => '\}', '(' => '\)'];
        $endmarkers = ['[' => ']', '<' => '>', '{' => '}', '(' => ')'];

        if (!isset($regex_endmarkers[$tagmarker])) {
            $tagmarker = '[';
        }

        $e = $regex_endmarkers[$tagmarker];
        $b = $regex_beginmarkers[$tagmarker];
        $this->tagmarker = $tagmarker;
        $this->end_tagmarker = $endmarkers[$tagmarker];

        // $this->input will be an array of tokens, with the special property that
        // the elements strictly alternate between plain text and tags/whitespace/newlines,
        // and that tags always have *two* entries per tag.  The first element will
        // always be plain text.  Note that the regexes below make VERY heavy use of
        // PCRE regex-syntax extensions, so don't even try to modify them unless you
        // know how things like (?!) and (?:) and (?=) work.  We use the /x modifier
        // here to make this a *lot* more legible and debuggable.

        // Is PHP crashing? Are you on Windows? Did you trace the crash to here?
        // Add ThreadStackSize 8388608 to the <IfModule mpm_winnt_module> section
        // of your conf\extra\httpd-mpm.conf file.
        // More info, see:
        // http://stackoverflow.com/questions/5058845/how-do-i-increase-the-stack-size-for-apache-running-under-windows-7
        $this->pat_main = "/( "
            // Match tags, as long as the opening marker is not immediately followed by
            // --, ', !-- or two more opening markers (those are comments / wiki-links).
            // Tags may contain "quoted" or 'quoted' sections that may contain [ or ] characters.
            // Tags may not contain newlines.
            ."{$b}"
            ."(?! -- | ' | !-- | {$b}{$b} )"
            ."(?: [^\\n\\r{$b}{$e}] | \\\" [^\\\"\\n\\r]* \\\" | \\' [^\\'\\n\\r]* \\' )*"
            ."{$e}"

            // Match wiki-links, which are of the form [[...]] or [[...|...]].  Unlike
            // tags, wiki-links treat " and ' marks as normal input characters; but they
            // still may not contain newlines.
            ."| {$b}{$b} (?: [^{$e}\\r\\n] | {$e}[^{$e}\\r\\n] ){1,256} {$e}{$e}"

            // Match single-line comments, which start with [-- or [' .
            ."| {$b} (?: -- | ' ) (?: [^{$e}\\n\\r]* ) {$e}"

            // Match multi-line comments, which start with [!-- and end with --] and contain
            // no --] in between.
            ."| {$b}!-- (?: [^-] | -[^-] | --[^{$e}] )* --{$e}"

            // Match five or more hyphens as a special token, which gets returned as a [rule] tag.
            ."| -----+"

            // Match newlines, in all four possible forms.
            ."| \\x0D\\x0A | \\x0A\\x0D | \\x0D | \\x0A"

            // Match whitespace, but only if it butts up against a newline, rule, or
            // bracket on at least one end.
            ."| [\\x00-\\x09\\x0B-\\x0C\\x0E-\\x20]+(?=[\\x0D\\x0A{$b}]|-----|$)"
            ."| (?<=[\\x0D\\x0A{$e}]|-----|^)[\\x00-\\x09\\x0B-\\x0C\\x0E-\\x20]+"

            ." )/Dx";

        $this->input = preg_split($this->pat_main, $string, -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($this->input === false) {
            $errorMessage = "Error lexing BBCode contents.";
            // preg_last_error_msg() only exists on newer PHP versions, hence the guard.
            if (function_exists('preg_last_error_msg')) {
                $errorMessage .= " " . preg_last_error_msg();
            }
            throw new BBCodeException($errorMessage);
        }

        // Patterns for matching specific types of tokens during lexing.
        $this->pat_comment = "/^ {$b} (?: -- | ' ) /Dx";
        $this->pat_comment2 = "/^ {$b}!-- (?: [^-] | -[^-] | --[^{$e}] )* --{$e} $/Dx";
        $this->pat_wiki = "/^ {$b}{$b} ([^\\|]*) (?:\\|(.*))? {$e}{$e} $/Dx";

        // Current lexing state.
        $this->ptr = 0;
        $this->unget = false;
        $this->state = self::BBCODE_LEXSTATE_TEXT;
        $this->verbatim = false;

        // Return values.
        $this->token = BBCode::BBCODE_EOI;
        $this->tag = false;
        $this->text = "";
    }

    /**
     * Return the byte value of the first character of a string, or 0 when it is empty.
     *
     * ord() returns 0 for an empty string, and the callers rely on that value; doing
     * the check here avoids handing ord() an empty string (the Kunena patch notes that
     * this produces a deprecation message on PHP 8.5).
     *
     * @param string|null $text The text to inspect.
     * @return int Returns the first byte as an integer in the range 0-255.
     */
    private static function leadingByte($text)
    {
        $text = (string)$text;

        return $text === '' ? 0 : ord($text[0]);
    }

    /**
     * Compute how many non-tag characters there are in the input, give or take a few.
     *
     * This is optimized for speed, not accuracy, so it'll get some stuff like
     * horizontal rules and weird whitespace characters wrong, but it's only supposed
     * to provide a rough quick guess, not a hard fact.
     *
     * The lexer's own read position and current token are not touched.
     *
     * @return int Returns the approximate text length.
     */
    public function guessTextLength()
    {
        $length = 0;
        $state = self::BBCODE_LEXSTATE_TEXT;

        foreach ($this->input as $text) {
            if ($state == self::BBCODE_LEXSTATE_TEXT) {
                // Plain text counts in full.
                $state = self::BBCODE_LEXSTATE_TAG;
                $length += strlen($text);
                continue;
            }

            // Delimiter piece: classify it by the first byte of *this* piece (not by
            // $this->text, which is the lexer's last returned token and unrelated here).
            $state = self::BBCODE_LEXSTATE_TEXT;

            switch (self::leadingByte($text)) {
                case 10:
                case 13:
                    // A newline counts as one character, whatever its form.
                    $length++;
                    break;

                case 40:
                case 60:
                case 91:
                case 123:
                    // "(", "<", "[", "{": a tag, comment or wiki-link contributes nothing.
                    break;

                default:
                    // Whitespace (and anything else) counts in full.
                    $length += strlen($text);
                    break;
            }
        }

        return $length;
    }

    /**
     * Return the type of the next token, either BBCODE_TAG or BBCODE_TEXT or BBCODE_EOI.
     *
     * This stores the content of this token into $this->text, the type of this token in $this->token, and possibly an
     * array into $this->tag.
     *
     * If this is a BBCODE_TAG token, $this->tag will be an array computed from the tag's contents, like this:
     *
     * ```
     * [
     *    '_name' => tag_name,
     *    '_end' => true if this is an end tag (i.e., the name starts with a /)
     *    '_default' => default value (for example, in [url=foo], this is "foo").
     *    ...
     *    ...all other key => value parameters given in the tag...
     *    ...
     * ]
     * ```
     *
     * Rule (-----) and wiki-link tokens are synthesized here rather than decoded; their arrays carry
     * '_name', '_endtag' and '_default' (plus 'title' for wiki-links) and no '_end' key.
     *
     * @return int Returns the token type, which is also stored in $this->token.
     */
    public function nextToken()
    {
        // Handle ungets; if the last token has been "ungotten", just return it again.
        if ($this->unget) {
            $this->unget = false;
            return $this->token;
        }

        // Loop until we find a valid (nonempty) token.
        while (true) {
            // Did we run out of tokens in the input?
            if ($this->ptr >= count($this->input)) {
                $this->text = "";
                $this->tag = false;
                return $this->token = BBCode::BBCODE_EOI;
            }

            // Inhale one token, sanitizing away any weird control characters.  We
            // allow \t, \r, and \n to pass through, but that's it.  The cast keeps
            // $this->text a string even if preg_replace() were to fail and return null.
            $this->text = (string)preg_replace(
                "/[\\x00-\\x08\\x0B-\\x0C\\x0E-\\x1F]/",
                "",
                $this->input[$this->ptr++]
            );

            if ($this->verbatim) {
                // In verbatim mode, we return *everything* as plain text or whitespace.
                $this->tag = false;

                if ($this->state == self::BBCODE_LEXSTATE_TEXT) {
                    $this->state = self::BBCODE_LEXSTATE_TAG;
                    $token_type = BBCode::BBCODE_TEXT;
                } else {
                    // This must be either whitespace, a newline, or a tag.
                    $this->state = self::BBCODE_LEXSTATE_TEXT;

                    switch (self::leadingByte($this->text)) {
                        case 10:
                        case 13:
                            // Newline.
                            $token_type = BBCode::BBCODE_NL;
                            break;

                        case 45:
                        case 40:
                        case 60:
                        case 91:
                        case 123:
                            // Rule, tag or comment: passed through as text.
                            $token_type = BBCode::BBCODE_TEXT;
                            break;

                        default:
                            // Whitespace.
                            $token_type = BBCode::BBCODE_WS;
                            break;
                    }
                }

                // Empty pieces are skipped; go around for the next one.
                if (strlen($this->text) > 0) {
                    return $this->token = $token_type;
                }
            } elseif ($this->state == self::BBCODE_LEXSTATE_TEXT) {
                // Next up is plain text, but only return it if it's nonempty.
                $this->state = self::BBCODE_LEXSTATE_TAG;
                $this->tag = false;

                if (strlen($this->text) > 0) {
                    return $this->token = BBCode::BBCODE_TEXT;
                }
            } else {
                // This must be either whitespace, a newline, or a tag.
                switch (self::leadingByte($this->text)) {
                    case 10:
                    case 13:
                        // Newline.
                        $this->tag = false;
                        $this->state = self::BBCODE_LEXSTATE_TEXT;
                        return $this->token = BBCode::BBCODE_NL;

                    case 45:
                        // A rule made of hyphens; return it as a [rule] tag.
                        if (preg_match("/^-----/", $this->text)) {
                            $this->tag = ['_name' => 'rule', '_endtag' => false, '_default' => ''];
                            $this->state = self::BBCODE_LEXSTATE_TEXT;
                            return $this->token = BBCode::BBCODE_TAG;
                        }

                        // Hyphen-led piece that is not a rule: hand it back as text.
                        $this->tag = false;
                        $this->state = self::BBCODE_LEXSTATE_TEXT;
                        if (strlen($this->text) > 0) {
                            return $this->token = BBCode::BBCODE_TEXT;
                        }
                        break;

                    case 40:
                    case 60:
                    case 91:
                    case 123:
                        // Tag or comment.  This is the most complicated one, because it
                        // needs to be parsed into its component pieces.

                        // See if this is a comment; if so, skip it.
                        if (preg_match($this->pat_comment, $this->text)) {
                            // This is a comment, not a tag, so treat it like it doesn't exist.
                            $this->state = self::BBCODE_LEXSTATE_TEXT;
                            break;
                        }
                        if (preg_match($this->pat_comment2, $this->text)) {
                            // This is a comment, not a tag, so treat it like it doesn't exist.
                            $this->state = self::BBCODE_LEXSTATE_TEXT;
                            break;
                        }

                        // See if this is a [[wiki link]]; if so, convert it into a [wiki="" title=""] tag.
                        if (preg_match($this->pat_wiki, $this->text, $matches)) {
                            // The title group is optional, so make sure both indexes exist.
                            $matches += [1 => null, 2 => null];
                            $this->tag = [
                                '_name' => 'wiki',
                                '_endtag' => false,
                                '_default' => $matches[1],
                                'title' => $matches[2]
                            ];
                            $this->state = self::BBCODE_LEXSTATE_TEXT;
                            return $this->token = BBCode::BBCODE_TAG;
                        }

                        // Not a comment, so parse it like a tag.
                        $this->tag = $this->decodeTag($this->text);
                        $this->state = self::BBCODE_LEXSTATE_TEXT;
                        return $this->token = ($this->tag['_end'] ? BBCode::BBCODE_ENDTAG : BBCode::BBCODE_TAG);

                    default:
                        // Whitespace.
                        $this->tag = false;
                        $this->state = self::BBCODE_LEXSTATE_TEXT;
                        return $this->token = BBCode::BBCODE_WS;
                }
            }
        }
    }

    /**
     * Un-gets the last token read so that a subsequent call to NextToken() will return it.
     *
     * Note that ungetToken() does not switch states when you switch between verbatim mode and standard mode:  For
     * example, if you read a tag, unget the tag, switch to verbatim mode, and then get the next token, you'll get back
     * a BBCODE_TAG --- exactly what you ungot, not a BBCODE_TEXT token.
     *
     * An end-of-input token is never ungotten.
     */
    public function ungetToken()
    {
        if ($this->token !== BBCode::BBCODE_EOI) {
            $this->unget = true;
        }
    }

    /**
     * Peek at the next token, but don't remove it.
     *
     * @return int Returns the type of the next token.
     */
    public function peekToken()
    {
        $result = $this->nextToken();

        // Same rule as ungetToken(): end-of-input is not pushed back.
        if ($this->token !== BBCode::BBCODE_EOI) {
            $this->unget = true;
        }

        return $result;
    }

    /**
     * Save the state of this lexer so it can be restored later.
     *
     * The return value from this should be considered opaque.  Because PHP uses copy-on-write references, the total
     * cost of the returned state is relatively small, and the running time of this function (and RestoreState) is very
     * fast.
     *
     * @return array Returns the saved state, suitable for {@link restoreState()}.
     */
    public function saveState()
    {
        return [
            'token' => $this->token,
            'text' => $this->text,
            'tag' => $this->tag,
            'state' => $this->state,
            'input' => $this->input,
            'ptr' => $this->ptr,
            'unget' => $this->unget,
            'verbatim' => $this->verbatim
        ];
    }

    /**
     * Restore the state of this lexer from a saved previous state.
     *
     * Non-array arguments are ignored.  Keys missing from the array are restored as null.
     *
     * @param array $state The previous lexer state.
     */
    public function restoreState($state)
    {
        if (!is_array($state)) {
            return;
        }

        $state += [
            'token' => null,
            'text' => null,
            'tag' => null,
            'state' => null,
            'input' => null,
            'ptr' => null,
            'unget' => null,
            'verbatim' => null
        ];

        $this->token = $state['token'];
        $this->text = $state['text'];
        $this->tag = $state['tag'];
        $this->state = $state['state'];
        $this->input = $state['input'];
        $this->ptr = $state['ptr'];
        $this->unget = $state['unget'];
        $this->verbatim = $state['verbatim'];
    }

    /**
     * Given a string, if it's surrounded by "quotes" or 'quotes', remove them.
     *
     * Only a matching pair is removed, and only when the string is at least two characters long.
     *
     * @param string $string The string to strip.
     * @return string Returns the string stripped of quotes.
     */
    protected function stripQuotes($string)
    {
        if (strlen($string) > 1) {
            $first = substr($string, 0, 1);
            $last = substr($string, -1);

            if ($first === $last && ($first === '"' || $first === "'")) {
                return substr($string, 1, -1);
            }
        }

        return $string;
    }

    /**
     * Given a tokenized piece of a tag, decide what type of token it is.
     *
     * Our return values are:
     *
     * - -1    End-of-input (EOI).
     * - '='   Token is an = sign.
     * - ' '   Token is whitespace.
     * - '"'   Token is quoted text (starting with either quote character).
     * - 'A'   Token is unquoted text.
     *
     * @param int $ptr The index of {@link $pieces} to examine.
     * @param array $pieces The pieces array to classify.
     * @return string|int Returns the classification: one of the strings above, or the integer -1 for EOI.
     */
    protected function classifyPiece($ptr, $pieces)
    {
        if ($ptr >= count($pieces)) {
            return -1; // EOI.
        }

        $piece = $pieces[$ptr];

        if ($piece == '=') {
            return '=';
        } elseif (preg_match("/^[\\'\\\"]/", $piece)) {
            return '"';
        } elseif (preg_match("/^[\\x00-\\x20]+$/", $piece)) {
            return ' ';
        } else {
            return 'A';
        }
    }

    /**
     * Given a string containing a complete [tag] (including its brackets), break it down into its components and
     * return them as an array.
     *
     * The result always has the keys '_tag', '_endtag', '_name', '_hasend', '_end' and '_default'.  For a well-formed
     * tag it also has '_params' (an ordered list of ['key' => ..., 'value' => ...] pairs, the first being the default
     * value) and one entry per parameter whose name does not begin with an underscore.
     *
     * @param string $tag The tag to decode.
     * @return array Returns the array representation of the tag.
     */
    protected function decodeTag($tag)
    {
        Debugger::debug("<b>Lexer::InternalDecodeTag:</b> input: ".htmlspecialchars($tag)."<br />\n");

        // Create the initial result object.
        $result = [
            '_tag' => $tag,
            '_endtag' => '',
            '_name' => '',
            '_hasend' => false,
            '_end' => false,
            '_default' => false
        ];

        // Strip off the [brackets] around the tag, leaving just its content.
        $tag = substr($tag, 1, strlen($tag) - 2);

        // The starting bracket *must* be followed by a non-whitespace character.
        // An empty tag yields 0 here and is rejected too.
        if (self::leadingByte($tag) <= 32) {
            return $result;
        }

        // Break it apart into words, quoted text, whitespace, and equal signs.
        $pieces = preg_split(
            "/(\\\"[^\\\"]+\\\"|\\'[^\\']+\\'|=|[\\x00-\\x20]+)/",
            $tag,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        $ptr = 0;

        // Handle malformed (empty) tags correctly.
        if (count($pieces) < 1) {
            return $result;
        }

        // The first piece should be the tag name, whatever it is.  If it starts with a /
        // we remove the / and mark it as an end tag.
        if (!empty($pieces[$ptr]) && substr($pieces[$ptr], 0, 1) === '/') {
            $result['_name'] = strtolower(substr($pieces[$ptr++], 1));
            $result['_end'] = true;
        } else {
            $result['_name'] = strtolower($pieces[$ptr++]);
            $result['_end'] = false;
        }

        // Skip whitespace after the tag name.
        while (($type = $this->classifyPiece($ptr, $pieces)) == ' ') {
            $ptr++;
        }

        $params = [];

        // If the next piece is an equal sign, then the tag's default value follows.
        if ($type != '=') {
            $result['_default'] = false;
            $params[] = ['key' => '', 'value' => ''];
        } else {
            $ptr++;

            // Skip whitespace after the initial equal-sign.
            while (($type = $this->classifyPiece($ptr, $pieces)) == ' ') {
                $ptr++;
            }

            // Examine the next (real) piece, and see if it's quoted; if not, we need to
            // use heuristics to guess where the default value begins and ends.
            if ($type == "\"") {
                $value = $this->stripQuotes($pieces[$ptr++]);
            } else {
                // Collect pieces going forward until we reach an = sign or the end of the
                // tag; then rewind before whatever comes before the = sign, and everything
                // between here and there becomes the default value.  This allows tags like
                // [font=Times New Roman size=4] to make sense even though the font name is
                // not quoted.  Note, however, that there's a special initial case, where
                // any equal-signs before whitespace are considered to be part of the parameter
                // as well; this allows an ugly tag like [url=http://foo?bar=baz target=my_window]
                // to behave in a way that makes (tolerable) sense.
                $after_space = false;
                $start = $ptr;

                while (($type = $this->classifyPiece($ptr, $pieces)) != -1) {
                    if ($type == ' ') {
                        $after_space = true;
                    }
                    if ($type == '=' && $after_space) {
                        break;
                    }
                    $ptr++;
                }

                // Ran off the end: step back onto the last real piece.
                if ($type == -1) {
                    $ptr--;
                }

                // We've now found the first (appropriate) equal-sign after the start of the
                // default value.  (In the example above, that's the "=" after "target".)  We
                // now have to rewind back to the last whitespace to find where the default
                // value ended.
                if ($type == '=') {
                    // Rewind before = sign.
                    $ptr--;

                    // Rewind before any whitespace before = sign.
                    while ($ptr > $start && $this->classifyPiece($ptr, $pieces) == ' ') {
                        $ptr--;
                    }

                    // Rewind before any text elements before that.
                    while ($ptr > $start && $this->classifyPiece($ptr, $pieces) != ' ') {
                        $ptr--;
                    }
                }

                // The default value is everything from $start to $ptr, inclusive.
                $value = "";
                for (; $start <= $ptr; $start++) {
                    if ($this->classifyPiece($start, $pieces) == ' ') {
                        $value .= " ";
                    } else {
                        $value .= $this->stripQuotes($pieces[$start]);
                    }
                }
                $value = trim($value);

                $ptr++;
            }

            $result['_default'] = $value;
            $params[] = ['key' => '', 'value' => $value];
        }

        // The rest of the tag is composed of either floating keys or key=value pairs, so walk through
        // the tag and collect them all.  Again, we have the nasty special case where an equal sign
        // in a parameter but before whitespace counts as part of that parameter.
        while (($type = $this->classifyPiece($ptr, $pieces)) != -1) {
            // Skip whitespace before the next key name.
            while ($type === ' ') {
                $ptr++;
                $type = $this->classifyPiece($ptr, $pieces);
            }

            // Decode the key name.
            if ($type === 'A' || $type === '"') {
                if (isset($pieces[$ptr])) {
                    $key = strtolower($this->stripQuotes($pieces[$ptr]));
                } else {
                    $key = '';
                }
                $ptr++;
            } elseif ($type === '=') {
                // A stray equal sign with no key in front of it is dropped.
                $ptr++;
                continue;
            } elseif ($type === -1) {
                break;
            }

            // Skip whitespace after the key name.
            while (($type = $this->classifyPiece($ptr, $pieces)) == ' ') {
                $ptr++;
            }

            // If an equal-sign follows, we need to collect a value.  Otherwise, we
            // take the key itself as the value.
            if ($type !== '=') {
                $value = $this->stripQuotes($key);
            } else {
                $ptr++;

                // Skip whitespace after the equal sign.
                while (($type = $this->classifyPiece($ptr, $pieces)) == ' ') {
                    $ptr++;
                }

                if ($type === '"') {
                    // If we get a quoted value, take that as the only value.
                    $value = $this->stripQuotes($pieces[$ptr++]);
                } elseif ($type !== -1) {
                    // If we get a non-quoted value, consume non-quoted values
                    // until we reach whitespace.
                    $value = $pieces[$ptr++];
                    while (($type = $this->classifyPiece($ptr, $pieces)) != -1 && $type != ' ') {
                        $value .= $pieces[$ptr++];
                    }
                } else {
                    $value = "";
                }
            }

            // Record this in the associative array if it's a legal public identifier name.
            // Legal *public* identifier names must *not* begin with an underscore.
            if (substr($key, 0, 1) !== '_') {
                $result[$key] = $value;
            }

            // Record this in the parameter list always.
            $params[] = ['key' => $key, 'value' => $value];
        }

        // Add the parameter list as a member of the associative array.
        $result['_params'] = $params;

        // In debugging modes, output the tag as we collected it.
        Debugger::debug("<b>Lexer::InternalDecodeTag:</b> output: ");
        Debugger::debug(htmlspecialchars(print_r($result, true))."<br />\n");

        // Save the resulting parameters, and return the whole shebang.
        return $result;
    }
}
| ver. 1.1 | |
.
| PHP 8.4.18 | Генерация страницы: 0 |
proxy
|
phpinfo
|
Настройка