Файловый менеджер - Редактировать - /var/www/html/Utils.zip
Ðазад
PK ! DyG�B2 B2 PHPUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Assert\Assert; use Wikimedia\Assert\UnreachableException; /** * This file contains Parsoid-independent PHP helper functions. * Over time, more functions can be migrated out of various other files here. * @module */ class PHPUtils { /** * Convert a counter to a Base64 encoded string. * Padding is stripped. /,+ are replaced with _,- respectively. * Warning: Max integer is 2^31 - 1 for bitwise operations. * @param int $n * @return string */ public static function counterToBase64( int $n ): string { $str = ''; do { $str = chr( $n & 0xff ) . $str; $n >>= 8; } while ( $n > 0 ); return rtrim( strtr( base64_encode( $str ), '+/', '-_' ), '=' ); } /** * FIXME: Copied from FormatJson.php in core * * Characters problematic in JavaScript. * * @note These are listed in ECMA-262 (5.1 Ed.), §7.3 Line Terminators along with U+000A (LF) * and U+000D (CR). However, PHP already escapes LF and CR according to RFC 4627. */ private const BAD_CHARS = [ "\u{2028}", // U+2028 LINE SEPARATOR "\u{2029}", // U+2029 PARAGRAPH SEPARATOR ]; /** * FIXME: Copied from FormatJson.php in core * * Escape sequences for characters listed in FormatJson::BAD_CHARS. 
*/ private const BAD_CHARS_ESCAPED = [ '\u2028', // U+2028 LINE SEPARATOR '\u2029', // U+2029 PARAGRAPH SEPARATOR ]; /** * FIXME: Core has FormatJson::encode that does a more comprehensive job * * json_encode wrapper function * - unscapes slashes and unicode * * @param mixed $o * @return string */ public static function jsonEncode( $o ): string { $str = json_encode( $o, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR ); $str = str_replace( self::BAD_CHARS, self::BAD_CHARS_ESCAPED, $str ); return $str; } /** * FIXME: Core has FormatJson::parse that does a more comprehensive job * json_decode wrapper function * @param string $str String to decode into the json object * @param bool $assoc Controls whether to parse as an an associative array - defaults to true * @return mixed */ public static function jsonDecode( string $str, bool $assoc = true ) { return json_decode( $str, $assoc ); } /** * Convert array to associative array usable as a read-only Set. * * @param array $a * @return array */ public static function makeSet( array $a ): array { return array_fill_keys( $a, true ); } /** * Helper to get last item of the array * @param mixed[] $a * @return mixed */ public static function lastItem( array $a ) { // Tim Starling recommends not using end() for perf reasons // since apparently it can be O(n) where the refcount on the // array is > 1. // // Note that end() is usable in non-array scenarios. But, in our case, // we are almost always dealing with arrays, so this helper probably // better for cases where we aren't sure the array isn't shared. return $a[count( $a ) - 1] ?? null; } /** * Append an array to an accumulator using the most efficient method * available. Makes sure that accumulation is O(n). 
* * See https://w.wiki/3zvE * * @param array &$dest Destination array * @param array $source Array to merge */ public static function pushArray( array &$dest, array $source ): void { if ( count( $dest ) < count( $source ) ) { $dest = array_merge( $dest, $source ); } else { foreach ( $source as $item ) { $dest[] = $item; } } } /** * Return a substring, asserting that it is valid UTF-8. * By default we assume the full string was valid UTF-8, which allows * us to look at the first and last bytes to make this check. * You can check the entire string if you are feeling paranoid; it * will take O(N) time (where N is the length of the substring) but * so does the substring operation. * * If the substring would start beyond the end of the string or * end before the start of the string, then this function will * return the empty string (as would JavaScript); note that the * native `substr` would return `false` in this case. * * Using this helper instead of native `substr` is * useful during the PHP port to verify that we don't break up * Unicode codepoints by the switch from JavaScript UCS-2 offsets * to PHP UTF-8 byte offsets. * * @param string $s The (sub)string to check * @param int $start The starting offset (in bytes). If negative, the * offset is counted from the end of the string. * @param ?int $length (optional) The maximum length of the returned * string. If negative, the end position is counted from the end of * the string. * @param bool $checkEntireString Whether to do a slower verification * of the entire string, not just the edges. Defaults to false. 
* @return string The checked substring */ public static function safeSubstr( string $s, int $start, ?int $length = null, bool $checkEntireString = false ): string { if ( $length === null ) { $ss = substr( $s, $start ); } else { $ss = substr( $s, $start, $length ); } if ( $ss === false ) { $ss = ''; } if ( strlen( $ss ) === 0 ) { return $ss; } $firstChar = ord( $ss ); Assert::invariant( ( $firstChar & 0xC0 ) !== 0x80, 'Bad UTF-8 at start of string' ); $i = 0; // This next loop won't step off the front of the string because we've // already asserted that the first character is not 10xx xxxx do { $i--; Assert::invariant( $i > -5, // This should never happen, assuming the original string // was valid UTF-8 'Bad UTF-8 at end of string (>4 byte sequence)' ); $lastChar = ord( $ss[$i] ); } while ( ( $lastChar & 0xC0 ) === 0x80 ); if ( ( $lastChar & 0x80 ) === 0 ) { Assert::invariant( // This shouldn't happen, assuming original string was valid $i === -1, 'Bad UTF-8 at end of string (1 byte sequence)' ); } elseif ( ( $lastChar & 0xE0 ) === 0xC0 ) { Assert::invariant( $i === -2, 'Bad UTF-8 at end of string (2 byte sequence)' ); } elseif ( ( $lastChar & 0xF0 ) === 0xE0 ) { Assert::invariant( $i === -3, 'Bad UTF-8 at end of string (3 byte sequence)' ); } elseif ( ( $lastChar & 0xF8 ) === 0xF0 ) { Assert::invariant( $i === -4, 'Bad UTF-8 at end of string (4 byte sequence)' ); } else { throw new UnreachableException( // This shouldn't happen, assuming original string was valid 'Bad UTF-8 at end of string' ); } if ( $checkEntireString ) { // We did the head/tail checks first because they give better // diagnostics in the common case where we broke UTF-8 by // the substring operation. self::assertValidUTF8( $ss ); } return $ss; } /** * Helper for verifying a valid UTF-8 encoding. Using * safeSubstr() is a more efficient way of doing this check in * most places, where you can assume that the original string was * valid UTF-8. 
This function does a complete traversal of the * string, in time proportional to the length of the string. * * @param string $s The string to check. */ public static function assertValidUTF8( string $s ): void { // Slow complete O(N) check for UTF-8 validity $r = preg_match( '//u', $s ); Assert::invariant( $r === 1, 'Bad UTF-8 (full string verification)' ); } /** * Helper for joining pieces of regular expressions together. This * safely strips delimiters from regular expression strings, while * ensuring that the result is safely escaped for the new delimiter * you plan to use (see the `$delimiter` argument to `preg_quote`). * Note that using a meta-character for the new delimiter can lead to * unexpected results; for example, if you use `!` then escaping * `(?!foo)` will break the regular expression. * * @param string $re The regular expression to strip * @param ?string $newDelimiter Optional delimiter which will be * used when recomposing this stripped regular expression into a * new regular expression. * @return string The regular expression without delimiters or flags */ public static function reStrip( string $re, ?string $newDelimiter = null ): string { static $delimiterPairs = [ '(' => ')', '[' => ']', '{' => '}', '<' => '>', ]; // Believe it or not, PHP allows leading whitespace in the $re // tested with C's "isspace", which is [ \f\n\r\t\v] $re = preg_replace( '/^[ \f\n\r\t\v]+/', '', $re ); Assert::invariant( strlen( $re ) > 0, "empty regexp" ); $startDelimiter = $re[0]; // PHP actually supports balanced delimiters (ie open paren on left // and close paren on right). $endDelimiter = $delimiterPairs[$startDelimiter] ?? 
$startDelimiter; $endDelimiterPos = strrpos( $re, $endDelimiter ); Assert::invariant( $endDelimiterPos !== false && $endDelimiterPos > 0, "can't find end delimiter" ); $flags = substr( $re, $endDelimiterPos + 1 ); Assert::invariant( preg_match( '/^[imsxADSUXJu \n]*$/D', $flags ) === 1, "unexpected flags" ); $stripped = substr( $re, 1, $endDelimiterPos - 1 ); if ( $newDelimiter === null || $startDelimiter === $newDelimiter || $endDelimiter === $newDelimiter ) { return $stripped; // done! } $newCloseDelimiter = $delimiterPairs[$startDelimiter] ?? $startDelimiter; // escape the new delimiter preg_match_all( '/[^\\\\]|\\\\./s', $stripped, $matches ); return implode( '', array_map( static function ( $c ) use ( $newDelimiter, $newCloseDelimiter ) { return ( $c === $newDelimiter || $c === $newCloseDelimiter ) ? ( '\\' . $c ) : $c; }, $matches[0] ) ); } /** * JS-compatible encodeURIComponent function * FIXME: See T221147 (for a post-port update) * * @param string $str * @return string */ public static function encodeURIComponent( string $str ): string { $revert = [ '%21' => '!', '%2A' => '*', '%27' => "'", '%28' => '(', '%29' => ')' ]; return strtr( rawurlencode( $str ), $revert ); } /** * Sort keys in an array, recursively, for better reproducibility. * (This is especially useful before serializing as JSON.) * * @param mixed &$array */ public static function sortArray( &$array ): void { if ( !is_array( $array ) ) { return; } ksort( $array ); foreach ( $array as $k => $v ) { self::sortArray( $array[$k] ); } } /** * Convert an iterable to an array. * * This function is similar to *but not the same as* the built-in * iterator_to_array, because arrays are iterable but not Traversable! * * This function is also present in the wmde/iterable-functions library, * but it's short enough that we don't need to pull in an entire new * dependency here. 
* * @see https://stackoverflow.com/questions/44587973/php-iterable-to-array-or-traversable * @see https://github.com/wmde/iterable-functions/blob/master/src/functions.php * * @phan-template T * @param iterable<T> $iterable * @return array<T> */ public static function iterable_to_array( iterable $iterable ): array { // phpcs:ignore MediaWiki.NamingConventions.LowerCamelFunctionsName.FunctionName,Generic.Files.LineLength.TooLong if ( is_array( $iterable ) ) { return $iterable; } '@phan-var \Traversable $iterable'; // @var \Traversable $iterable return iterator_to_array( $iterable ); } /** * Indicate that the code which calls this function is intended to be * unreachable. * * This is a workaround for T247093; this has been moved upstream * into wikimedia/assert. * * @param string $reason * @return never * @deprecated Just throw an UnreachableException instead. */ public static function unreachable( string $reason = "should never happen" ) { throw new UnreachableException( $reason ); } /** * If a string starts with a given prefix, remove the prefix. Otherwise, * return the original string. Like preg_replace( "/^$prefix/", '', $subject ) * except about 1.14x faster in the replacement case and 2x faster in * the no-op case. * * Note: adding type declarations to the parameters adds an overhead of 3%. * The benchmark above was without type declarations. * * @param string $subject * @param string $prefix * @return string */ public static function stripPrefix( $subject, $prefix ) { if ( str_starts_with( $subject, $prefix ) ) { return substr( $subject, strlen( $prefix ) ); } else { return $subject; } } /** * If a string ends with a given suffix, remove the suffix. Otherwise, * return the original string. Like preg_replace( "/$suffix$/", '', $subject ) * except faster. 
* * @param string $subject * @param string $suffix * @return string */ public static function stripSuffix( $subject, $suffix ) { if ( str_ends_with( $subject, $suffix ) ) { return substr( $subject, 0, -strlen( $suffix ) ); } else { return $subject; } } } PK ! {z0�q� q� WTUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); // Suppress UnusedPluginSuppression because // Phan on PHP 7.4 and PHP 8.1 need different suppressions // @phan-file-suppress UnusedPluginSuppression,UnusedPluginFileSuppression namespace Wikimedia\Parsoid\Utils; use DOMException; use Wikimedia\Assert\UnreachableException; use Wikimedia\Bcp47Code\Bcp47Code; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\Ext\ExtensionTagHandler; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\NodeData\I18nInfo; use Wikimedia\Parsoid\NodeData\TempData; use Wikimedia\Parsoid\Tokens\CommentTk; use Wikimedia\Parsoid\Wikitext\Consts; use Wikimedia\Parsoid\Wt2Html\Frame; /** * These utilites pertain to querying / extracting / modifying wikitext information from the DOM. */ class WTUtils { private const FIRST_ENCAP_REGEXP = '#(?:^|\s)(mw:(?:Transclusion|Param|LanguageVariant|Extension(/\S+)))(?=$|\s)#D'; /** * Regex corresponding to FIRST_ENCAP_REGEXP, but excluding extensions. If FIRST_ENCAP_REGEXP is * updated, this one should be as well. */ private const NON_EXTENSION_ENCAP_REGEXP = '#(?:^|\s)(mw:(?:Transclusion|Param|LanguageVariant))(?=$|\s)#D'; /** * Regexp for checking marker metas typeofs representing * transclusion markup or template param markup. 
*/ private const TPL_META_TYPE_REGEXP = '#^mw:(?:Transclusion|Param)(?:/End)?$#D'; /** * Regexp for checking marker metas typeofs representing * annotation markup */ public const ANNOTATION_META_TYPE_REGEXP = '#^mw:(?:Annotation/([\w\d]+))(?:/End)?$#uD'; /** * Check whether a node's data-parsoid object includes * an indicator that the original wikitext was a literal * HTML element (like table or p) * * @param DataParsoid $dp * @return bool */ public static function hasLiteralHTMLMarker( DataParsoid $dp ): bool { return isset( $dp->stx ) && $dp->stx === 'html'; } /** * Run a node through {@link #hasLiteralHTMLMarker}. * @param ?Node $node * @return bool */ public static function isLiteralHTMLNode( ?Node $node ): bool { return $node instanceof Element && self::hasLiteralHTMLMarker( DOMDataUtils::getDataParsoid( $node ) ); } /** * @param Node $node * @return bool */ public static function isZeroWidthWikitextElt( Node $node ): bool { return isset( Consts::$ZeroWidthWikitextTags[DOMCompat::nodeName( $node )] ) && !self::isLiteralHTMLNode( $node ); } /** * Is `$node` a block node that is also visible in wikitext? * An example of an invisible block node is a `<p>`-tag that * Parsoid generated, or a `<ul>`, `<ol>` tag. * * @param Node $node * @return bool */ public static function isBlockNodeWithVisibleWT( Node $node ): bool { return DOMUtils::isWikitextBlockNode( $node ) && !self::isZeroWidthWikitextElt( $node ); } /** * Helper functions to detect when an A-$node uses [[..]]/[..]/... style * syntax (for wikilinks, ext links, url links). rel-type is not sufficient * anymore since mw:ExtLink is used for all the three link syntaxes. 
* * @param Element $node * @return bool */ public static function isATagFromWikiLinkSyntax( Element $node ): bool { if ( DOMCompat::nodeName( $node ) !== 'a' ) { return false; } $dp = DOMDataUtils::getDataParsoid( $node ); return DOMUtils::hasRel( $node, 'mw:WikiLink' ) || ( isset( $dp->stx ) && $dp->stx !== "url" && $dp->stx !== "magiclink" ); } /** * Helper function to detect when an A-node uses ext-link syntax. * rel attribute is not sufficient anymore since mw:ExtLink is used for * multiple link types * * @param Element $node * @return bool */ public static function isATagFromExtLinkSyntax( Element $node ): bool { if ( DOMCompat::nodeName( $node ) !== 'a' ) { return false; } $dp = DOMDataUtils::getDataParsoid( $node ); return DOMUtils::hasRel( $node, 'mw:ExtLink' ) && ( !isset( $dp->stx ) || ( $dp->stx !== "url" && $dp->stx !== "magiclink" ) ); } /** * Helper function to detect when an A-node uses url-link syntax. * rel attribute is not sufficient anymore since mw:ExtLink is used for * multiple link types * * @param Element $node * @return bool */ public static function isATagFromURLLinkSyntax( Element $node ): bool { if ( DOMCompat::nodeName( $node ) !== 'a' ) { return false; } $dp = DOMDataUtils::getDataParsoid( $node ); return DOMUtils::hasRel( $node, 'mw:ExtLink' ) && isset( $dp->stx ) && $dp->stx === "url"; } /** * Helper function to detect when an A-node uses magic-link syntax. * rel attribute is not sufficient anymore since mw:ExtLink is used for * multiple link types * * @param Element $node * @return bool */ public static function isATagFromMagicLinkSyntax( Element $node ): bool { if ( DOMCompat::nodeName( $node ) !== 'a' ) { return false; } $dp = DOMDataUtils::getDataParsoid( $node ); return DOMUtils::hasRel( $node, 'mw:ExtLink' ) && isset( $dp->stx ) && $dp->stx === 'magiclink'; } /** * Check whether a node's typeof indicates that it is a template expansion. * * @param Element $node * @return ?string The matched type, or null if no match. 
*/ public static function matchTplType( Element $node ): ?string { return DOMUtils::matchTypeOf( $node, self::TPL_META_TYPE_REGEXP ); } /** * Check whether a typeof indicates that it signifies an * expanded attribute. * * @param Element $node * @return bool */ public static function hasExpandedAttrsType( Element $node ): bool { return DOMUtils::matchTypeOf( $node, '/^mw:ExpandedAttrs(\/\S+)*$/' ) !== null; } /** * Check whether a node is a meta tag that signifies a template expansion. * * @param Node $node * @return bool */ public static function isTplMarkerMeta( Node $node ): bool { return DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP ) !== null; } /** * Check whether a node is a meta signifying the start of a template expansion. * * @param Node $node * @return bool */ public static function isTplStartMarkerMeta( Node $node ): bool { $t = DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP ); return $t !== null && !str_ends_with( $t, '/End' ); } /** * Check whether a node is a meta signifying the end of a template expansion. * * @param Node $node * @return bool */ public static function isTplEndMarkerMeta( Node $node ): bool { $t = DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP ); return $t !== null && str_ends_with( $t, '/End' ); } /** * Find the first wrapper element of encapsulated content. 
* @param Node $node * @return Element|null */ public static function findFirstEncapsulationWrapperNode( Node $node ): ?Element { if ( !self::isEncapsulatedDOMForestRoot( $node ) ) { return null; } /** @var Element $node */ DOMUtils::assertElt( $node ); $about = DOMCompat::getAttribute( $node, 'about' ); $prev = $node; do { $node = $prev; $prev = DiffDOMUtils::previousNonDeletedSibling( $node ); } while ( $prev instanceof Element && DOMCompat::getAttribute( $prev, 'about' ) === $about ); // NOTE: findFirstEncapsulationWrapperNode can be called by code // even before templates have been fully encapsulated everywhere. // ProcessTreeBuilderFixups::removeAutoInsertedEmptyTags is the main // culprit here and it makes the contract for this helper murky // by hiding potential brokenness since this should never return null // once all templates have been encapsulated! $elt = self::isFirstEncapsulationWrapperNode( $node ) ? $node : null; '@phan-var ?Element $elt'; // @var ?Element $elt return $elt; } /** * This tests whether a DOM node is a new node added during an edit session * or an existing node from parsed wikitext. * * As written, this function can only be used on non-template/extension content * or on the top-level nodes of template/extension content. This test will * return the wrong results on non-top-level $nodes of template/extension content. * * @param Node $node * @return bool */ public static function isNewElt( Node $node ): bool { // We cannot determine newness on text/comment $nodes. if ( !( $node instanceof Element ) ) { return false; } // For template/extension content, newness should be // checked on the encapsulation wrapper $node. $node = self::findFirstEncapsulationWrapperNode( $node ) ?? $node; return DOMDataUtils::getDataParsoid( $node )->getTempFlag( TempData::IS_NEW ); } /** * Check whether a pre is caused by indentation in the original wikitext. 
* @param Node $node * @return bool */ public static function isIndentPre( Node $node ): bool { return DOMCompat::nodeName( $node ) === "pre" && !self::isLiteralHTMLNode( $node ); } /** * @param Node $node * @return bool */ public static function isInlineMedia( Node $node ): bool { return self::isGeneratedFigure( $node ) && DOMCompat::nodeName( $node ) !== 'figure'; // span, figure-inline } /** * @param Node $node * @return bool */ public static function isGeneratedFigure( Node $node ): bool { // TODO: Remove "Image|Video|Audio" when version 2.4.0 of the content // is no longer supported return DOMUtils::matchTypeOf( $node, '#^mw:(File|Image|Video|Audio)($|/)#D' ) !== null; } /** * Find how much offset is necessary for the DSR of an * indent-originated pre tag. * * @param Node $textNode * @return int */ public static function indentPreDSRCorrection( Node $textNode ): int { // NOTE: This assumes a text-node and doesn't check that it is one. // // FIXME: Doesn't handle text nodes that are not direct children of the pre if ( self::isIndentPre( $textNode->parentNode ) ) { $numNLs = substr_count( $textNode->nodeValue, "\n" ); if ( $textNode->parentNode->lastChild === $textNode ) { // We dont want the trailing newline of the last child of the pre // to contribute a pre-correction since it doesn't add new content // in the pre-node after the text if ( str_ends_with( $textNode->nodeValue, "\n" ) ) { $numNLs--; } } return $numNLs; } else { return 0; } } /** * Check if $node is a root in an encapsulated DOM forest. * * @param Node $node * @return bool */ public static function isEncapsulatedDOMForestRoot( Node $node ): bool { $about = $node instanceof Element ? DOMCompat::getAttribute( $node, 'about' ) : null; if ( $about !== null ) { // FIXME: Ensure that our DOM spec clarifies this expectation return Utils::isParsoidObjectId( $about ); } else { return false; } } /** * Does $node represent a redirect link? 
*/ public static function isRedirectLink( ?Node $node ): bool { return $node instanceof Element && DOMCompat::nodeName( $node ) === 'link' && DOMUtils::matchRel( $node, '#\bmw:PageProp/redirect\b#' ) !== null; } /** * Does $node represent a category link? */ public static function isCategoryLink( ?Node $node ): bool { return $node instanceof Element && DOMCompat::nodeName( $node ) === 'link' && DOMUtils::matchRel( $node, '#\bmw:PageProp/Category\b#' ) !== null; } /** * Does $node represent a link that is sol-transparent? */ public static function isSolTransparentLink( ?Node $node ): bool { return $node instanceof Element && DOMCompat::nodeName( $node ) === 'link' && DOMUtils::matchRel( $node, TokenUtils::SOL_TRANSPARENT_LINK_REGEX ) !== null; } /** * Check if '$node' emits wikitext that is sol-transparent in wikitext form. * This is a test for wikitext that doesn't introduce line breaks. * * Comment, whitespace text $nodes, category links, redirect links, behavior * switches, and include directives currently satisfy this definition. * * This should come close to matching TokenUtils.isSolTransparent() * * @param Node $node * @return bool */ public static function emitsSolTransparentSingleLineWT( Node $node ): bool { if ( $node instanceof Text ) { // NB: We differ here to meet the nl condition. return (bool)preg_match( '/^[ \t]*$/D', $node->nodeValue ); } elseif ( self::isRenderingTransparentNode( $node ) ) { // NB: The only metas in a DOM should be for behavior switches and // include directives, other than explicit HTML meta tags. This // differs from our counterpart in Util where ref meta tokens // haven't been expanded to spans yet. return true; } else { return false; } } /** * This is the span added to headings to add fallback ids for when legacy * and HTML5 ids don't match up. This prevents broken links to legacy ids. 
* * @param Node $node * @return bool */ public static function isFallbackIdSpan( Node $node ): bool { return DOMUtils::hasNameAndTypeOf( $node, 'span', 'mw:FallbackId' ); } /** * These are primarily 'metadata'-like $nodes that don't show up in output rendering. * - In Parsoid output, they are represented by link/meta tags. * - In the PHP parser, they are completely stripped from the input early on. * Because of this property, these rendering-transparent $nodes are also * SOL-transparent for the purposes of parsing behavior. * * @param Node $node * @return bool */ public static function isRenderingTransparentNode( Node $node ): bool { // FIXME: Can we change this entire thing to // $node instanceof Comment || // DOMUtils::getDataParsoid($node).stx !== 'html' && // (DOMCompat::nodeName($node) === 'meta' || DOMCompat::nodeName($node) === 'link') // return $node instanceof Comment || self::isSolTransparentLink( $node ) || ( // Catch-all for everything else. $node instanceof Element && DOMCompat::nodeName( $node ) === 'meta' && !self::isMarkerAnnotation( $node ) && ( DOMDataUtils::getDataParsoid( $node )->stx ?? '' ) !== 'html' ) || self::isFallbackIdSpan( $node ); } /** * Is $node nested inside a table tag that uses HTML instead of native * wikitext? * * @param Node $node * @return bool */ public static function inHTMLTableTag( Node $node ): bool { $p = $node->parentNode; while ( DOMUtils::isTableTag( $p ) ) { if ( self::isLiteralHTMLNode( $p ) ) { return true; } elseif ( DOMCompat::nodeName( $p ) === 'table' ) { // Don't cross <table> boundaries return false; } $p = $p->parentNode; } return false; } /** * Is $node the first wrapper element of encapsulated content? * * @param Node $node * @return bool */ public static function isFirstEncapsulationWrapperNode( Node $node ): bool { return DOMUtils::matchTypeOf( $node, self::FIRST_ENCAP_REGEXP ) !== null; } /** * Is $node the first wrapper element of extension content? 
* * @param Node $node * @return bool */ public static function isFirstExtensionWrapperNode( Node $node ): bool { return DOMUtils::matchTypeOf( $node, "#mw:Extension/#" ) !== null; } /** * Checks whether a first encapsulation wrapper node is encapsulating an extension * that outputs MediaWiki Core DOM Spec HTML (https://www.mediawiki.org/wiki/Specs/HTML) * @param Node $node * @param Env $env * @return bool */ public static function isExtensionOutputtingCoreMwDomSpec( Node $node, Env $env ): bool { if ( DOMUtils::matchTypeOf( $node, self::NON_EXTENSION_ENCAP_REGEXP ) !== null ) { return false; } $extTagName = self::getExtTagName( $node ); $extConfig = $env->getSiteConfig()->getExtTagConfig( $extTagName ); $htmlType = $extConfig['options']['outputHasCoreMwDomSpecMarkup'] ?? null; return $htmlType === true; } /** * Is $node an encapsulation wrapper elt? * * All root-level $nodes of generated content are considered * encapsulation wrappers and share an about-id. * * @param Node $node * @return bool */ public static function isEncapsulationWrapper( Node $node ): bool { // True if it has an encapsulation type or while walking backwards // over elts with identical about ids, we run into a $node with an // encapsulation type. if ( !( $node instanceof Element ) ) { return false; } return self::findFirstEncapsulationWrapperNode( $node ) !== null; } /** * Is $node a DOMFragment wrapper? * * @param Node $node * @return bool */ public static function isDOMFragmentWrapper( Node $node ): bool { // See TokenUtils::hasDOMFragmentType return DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment(/sealed/\w+)?$#D' ) !== null; } /** * Is $node a sealed DOMFragment of a specific type? * * @param Node $node * @param string $type * @return bool */ public static function isSealedFragmentOfType( Node $node, string $type ): bool { return DOMUtils::hasTypeOf( $node, "mw:DOMFragment/sealed/$type" ); } /** * Is $node a Parsoid-generated <section> tag? 
* * @param Node $node * @return bool */ public static function isParsoidSectionTag( Node $node ): bool { return $node instanceof Element && DOMCompat::nodeName( $node ) === 'section' && $node->hasAttribute( 'data-mw-section-id' ); } /** * Is the $node from extension content? * @param Node $node * @param ?string $extType If non-null, checks for that specific extension * @return bool */ public static function fromExtensionContent( Node $node, ?string $extType = null ): bool { $re = $extType ? "#mw:Extension/$extType#" : "#mw:Extension/\w+#"; while ( $node && !DOMUtils::atTheTop( $node ) ) { if ( DOMUtils::matchTypeOf( $node, $re ) ) { return true; } $node = $node->parentNode; } return false; } /** * Is $node from encapsulated (template, extension, etc.) content? * @param Node $node * @return bool */ public static function fromEncapsulatedContent( Node $node ): bool { while ( $node && !DOMUtils::atTheTop( $node ) ) { if ( self::findFirstEncapsulationWrapperNode( $node ) !== null ) { return true; } $node = $node->parentNode; } return false; } /** * Compute, when possible, the wikitext source for a $node in * an environment env. Returns null if the source cannot be * extracted. * @param Frame $frame * @param Element $node * @return string|null */ public static function getWTSource( Frame $frame, Element $node ): ?string { $dp = DOMDataUtils::getDataParsoid( $node ); $dsr = $dp->dsr ?? null; // FIXME: We could probably change the null return to '' // Just need to verify that code that uses this won't break return Utils::isValidDSR( $dsr ) ? $dsr->substr( $frame->getSrcText() ) : null; } /** * Gets all siblings that follow '$node' that have an 'about' as * their about id. * * This is used to fetch transclusion/extension content by using * the about-id as the key. This works because * transclusion/extension content is a forest of dom-trees formed * by adjacent dom-nodes. This is the contract that template * encapsulation, dom-reuse, and VE code all have to abide by. 
* * The only exception to this adjacency rule is IEW nodes in * fosterable positions (in tables) which are not span-wrapped to * prevent them from getting fostered out. * * @param Node $node * @param ?string $about * @return Node[] */ public static function getAboutSiblings( Node $node, ?string $about ): array { $nodes = [ $node ]; if ( $about === null ) { return $nodes; } $node = $node->nextSibling; while ( $node && ( ( $node instanceof Element && DOMCompat::getAttribute( $node, 'about' ) === $about ) || ( DOMUtils::isFosterablePosition( $node ) && DOMUtils::isIEW( $node ) ) ) ) { $nodes[] = $node; $node = $node->nextSibling; } // Remove already consumed trailing IEW, if any while ( count( $nodes ) > 0 && DOMUtils::isIEW( $nodes[count( $nodes ) - 1] ) ) { array_pop( $nodes ); } return $nodes; } /** * This function is only intended to be used on encapsulated $nodes * (Template/Extension/Param content). * * Given a '$node' that has an about-id, it is assumed that it is generated * by templates or extensions. This function skips over all * following content nodes and returns the first non-template node * that follows it. * * @param Node $node * @return Node|null */ public static function skipOverEncapsulatedContent( Node $node ): ?Node { $about = $node instanceof Element ? DOMCompat::getAttribute( $node, 'about' ) : null; if ( $about !== null ) { // Guaranteed not to be empty. It will at least include $node. $aboutSiblings = self::getAboutSiblings( $node, $about ); return end( $aboutSiblings )->nextSibling; } else { return $node->nextSibling; } } /** * Comment encoding/decoding. * * * Some relevant phab tickets: T94055, T70146, T60184, T95039 * * The wikitext comment rule is very simple: <!-- starts a comment, * and --> ends a comment. 
This means we can have almost anything as the * contents of a comment (except the string "-->", but see below), including * several things that are not valid in HTML5 comments: * * * For one, the html5 comment parsing algorithm [0] leniently accepts * --!> as a closing comment tag, which differs from the php+tidy combo. * * * If the comment's data matches /^-?>/, html5 will end the comment. * For example, <!-->stuff<--> breaks up as * <!--> (the comment) followed by, stuff<--> (as text). * * * Finally, comment data shouldn't contain two consecutive hyphen-minus * characters (--), nor end in a hyphen-minus character (/-$/) as defined * in the spec [1]. * * We work around all these problems by using HTML entity encoding inside * the comment body. The characters -, >, and & must be encoded in order * to prevent premature termination of the comment by one of the cases * above. Encoding other characters is optional; all entities will be * decoded during wikitext serialization. * * In order to allow *arbitrary* content inside a wikitext comment, * including the forbidden string "-->" we also do some minimal entity * decoding on the wikitext. We are also limited by our inability * to encode DSR attributes on the comment $node, so our wikitext entity * decoding must be 1-to-1: that is, there must be a unique "decoded" * string for every wikitext sequence, and for every decoded string there * must be a unique wikitext which creates it. * * The basic idea here is to replace every string ab*c with the string with * one more b in it. This creates a string with no instance of "ac", * so you can use 'ac' to encode one more code point. In this case * a is "--&", "b" is "amp;", and "c" is "gt;" and we use ac to * encode "-->" (which is otherwise unspeakable in wikitext). * * Note that any user content which does not match the regular * expression /--(>|&(amp;)*gt;)/ is unchanged in its wikitext * representation, as shown in the first two examples below. 
*
	 * In the HTML5 DOM column, '-', '>' and '&' appear as the hex numeric
	 * entities that Utils::entityEncodeAll() produces for them.
	 *
	 * User-authored comment text    Wikitext          HTML5 DOM
	 * --------------------------    --------------    -------------------------
	 * & - >                         & - >             &#x26; &#x2D; &#x3E;
	 * Use &gt; here                 Use &gt; here     Use &#x26;gt; here
	 * -->                           --&gt;            &#x2D;&#x2D;&#x3E;
	 * --&gt;                        --&amp;gt;        &#x2D;&#x2D;&#x26;gt;
	 * --&amp;gt;                    --&amp;amp;gt;    &#x2D;&#x2D;&#x26;amp;gt;
	 *
	 * [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state
	 * [1] http://www.w3.org/TR/html5/syntax.html#comments
	 *
	 * Map a wikitext-escaped comment to an HTML DOM-escaped comment.
	 *
	 * @param string $comment Wikitext-escaped comment.
	 * @return string DOM-escaped comment.
	 */
	public static function encodeComment( string $comment ): string {
		// Undo wikitext escaping to obtain "true value" of comment.
		$trueValue = preg_replace_callback(
			'/--&(amp;)*gt;/',
			static function ( $m ) {
				return Utils::decodeWtEntities( $m[0] );
			},
			$comment
		);

		// Now encode '-', '>' and '&' in the "true value" as HTML entities,
		// so that they can be safely embedded in an HTML comment.
		// This part doesn't have to map strings 1-to-1.
		return preg_replace_callback(
			'/[->&]/',
			static function ( $m ) {
				return Utils::entityEncodeAll( $m[0] );
			},
			$trueValue
		);
	}

	/**
	 * Map an HTML DOM-escaped comment to a wikitext-escaped comment.
	 *
	 * The inverse of encodeComment(): entities are decoded to recover the
	 * "true value", and then every "--" followed by ">" or by
	 * "&", zero or more "amp;", and "gt;" gets one more level of escaping,
	 * so that the literal string "-->" never appears in the result.
	 *
	 * @param string $comment DOM-escaped comment.
	 * @return string Wikitext-escaped comment.
	 */
	public static function decodeComment( string $comment ): string {
		// Undo HTML entity escaping to obtain "true value" of comment.
		$trueValue = Utils::decodeWtEntities( $comment );

		// ok, now encode this "true value" of the comment in such a way
		// that the string "-->" never shows up.  (See above.)
		return preg_replace_callback(
			'/--(&(amp;)*gt;|>)/',
			static function ( $m ) {
				$s = $m[0];
				if ( $s === '-->' ) {
					// The bare terminator is written as "--&gt;".
					return '--&gt;';
				}
				// "--&" + (amp;)* + "gt;" gains one extra "amp;" after
				// the ampersand; substr() drops the original "--&".
				return '--&amp;' . substr( $s, 3 );
			},
			$trueValue
		);
	}

	/**
	 * Utility function: we often need to know the wikitext DSR length for
	 * an HTML DOM comment value.
	 *
	 * @param Comment|CommentTk $node A comment node containing a DOM-escaped comment.
* @return int The wikitext length in UTF-8 bytes necessary to encode this
	 *   comment, including 7 characters for the `<!--` and `-->` delimiters.
	 */
	public static function decodedCommentLength( $node ): int {
		if ( $node instanceof Comment ) {
			$value = $node->nodeValue;
			$prev = $node->previousSibling;
			$unclosed = $prev &&
				DOMUtils::hasTypeOf( $prev, "mw:Placeholder/UnclosedComment" );
		} elseif ( $node instanceof CommentTk ) {
			// @phan-suppress-next-line PhanUndeclaredProperty dynamic property
			$unclosed = isset( $node->dataParsoid->unclosedComment );
			$value = $node->value;
		} else {
			throw new UnreachableException( 'Should not be here!' );
		}

		// 7 covers the "<!--" and "-->" delimiters in wikitext; an unclosed
		// comment only has the 4-character "<!--" opener.
		$syntaxLen = $unclosed ? 4 : 7;
		return $syntaxLen + strlen( self::decodeComment( $value ) );
	}

	/**
	 * Return the lower-cased extension tag name taken from the node's
	 * "mw:Extension/<name>" typeof, or null if it has no such typeof.
	 *
	 * @param Node $node
	 * @return ?string
	 */
	public static function getExtTagName( Node $node ): ?string {
		$typeOf = DOMUtils::matchTypeOf( $node, '#^mw:Extension/(.+?)$#D' );
		if ( !$typeOf ) {
			return null;
		}
		$tagName = substr( $typeOf, strlen( 'mw:Extension/' ) );
		return mb_strtolower( $tagName );
	}

	/**
	 * Look up the site config's tag implementation for the extension that
	 * $node's typeof names; null when the node names no extension.
	 *
	 * @param Env $env
	 * @param Node $node
	 * @return ?ExtensionTagHandler
	 */
	public static function getNativeExt( Env $env, Node $node ): ?ExtensionTagHandler {
		$extTagName = self::getExtTagName( $node );
		if ( !$extTagName ) {
			return null;
		}
		return $env->getSiteConfig()->getExtTagImpl( $extTagName );
	}

	/**
	 * Is this an include directive?
* @param string $name
	 * @return bool
	 */
	public static function isIncludeTag( string $name ): bool {
		return in_array( $name, [ 'includeonly', 'noinclude', 'onlyinclude' ], true );
	}

	/**
	 * Check if tag is annotation or extension directive
	 * Adapted from similar grammar function
	 *
	 * The name is lower-cased before it is compared against the installed
	 * extension tags, the include directives and the annotation tags.
	 *
	 * @param Env $env
	 * @param string $name
	 * @return bool
	 */
	public static function isAnnOrExtTag( Env $env, string $name ): bool {
		$lcName = mb_strtolower( $name );
		$config = $env->getSiteConfig();

		$installedTags = $config->getExtensionTagNameMap();
		$isExt = isset( $installedTags[$lcName] );
		$isInclude = self::isIncludeTag( $lcName );
		$isAnnotation = $config->isAnnotationTag( $lcName );

		// avoid crashing on <tvar|name> even if we don't support that syntax explicitly
		$pipeAt = $isAnnotation ? false : strpos( $lcName, '|' );
		if ( $pipeAt ) {
			$isAnnotation = $config->isAnnotationTag( substr( $lcName, 0, $pipeAt ) );
		}

		return $isExt || $isInclude || $isAnnotation;
	}

	/**
	 * Build a DocumentFragment whose only child is a span with typeof
	 * "mw:I18n". The span still has to be filled in with setDataNodeI18n
	 * to be valid.
	 *
	 * @param Document $doc
	 * @return DocumentFragment
	 * @throws DOMException
	 */
	public static function createEmptyLocalizationFragment( Document $doc ): DocumentFragment {
		$i18nSpan = $doc->createElement( 'span' );
		DOMUtils::addTypeOf( $i18nSpan, 'mw:I18n' );

		$fragment = $doc->createDocumentFragment();
		$fragment->appendChild( $i18nSpan );
		return $fragment;
	}

	/**
	 * Creates an internationalization (i18n) message that will be localized into the page content
	 * language. The returned DocumentFragment contains, as a single child, a span
	 * element with the appropriate information for later localization.
* @param Document $doc
	 * @param string $key message key for the message to be localized
	 * @param ?array $params parameters for localization
	 * @return DocumentFragment
	 * @throws DOMException
	 */
	public static function createPageContentI18nFragment(
		Document $doc, string $key, ?array $params = null
	): DocumentFragment {
		$fragment = self::createEmptyLocalizationFragment( $doc );
		// Attach the page-content-language message info to the wrapper span.
		DOMDataUtils::setDataNodeI18n(
			$fragment->firstChild,
			I18nInfo::createPageContentI18n( $key, $params )
		);
		return $fragment;
	}

	/**
	 * Build a fragment holding a single i18n span whose message will later
	 * be localized into the user interface language.
	 *
	 * @param Document $doc
	 * @param string $key message key for the message to be localized
	 * @param ?array $params parameters for localization
	 * @return DocumentFragment
	 * @throws DOMException
	 */
	public static function createInterfaceI18nFragment(
		Document $doc, string $key, ?array $params = null
	): DocumentFragment {
		$fragment = self::createEmptyLocalizationFragment( $doc );
		// Attach the interface-language message info to the wrapper span.
		DOMDataUtils::setDataNodeI18n(
			$fragment->firstChild,
			I18nInfo::createInterfaceI18n( $key, $params )
		);
		return $fragment;
	}

	/**
	 * Creates an internationalization (i18n) message that will be localized into an arbitrary
	 * language. The returned DocumentFragment contains, as a single child, a span
	 * element with the appropriate information for later localization.
	 * The use of this method is discouraged; use ::createPageContentI18nFragment(...) and
	 * ::createInterfaceI18nFragment(...) where possible rather than, respectively,
	 * ::createLangI18nFragment(..., $wgContLang, ...) and
	 * ::createLangI18nFragment(..., $wgLang,...).
* @param Document $doc
	 * @param Bcp47Code $lang language for the localization
	 * @param string $key message key for the message to be localized
	 * @param ?array $params parameters for localization
	 * @return DocumentFragment
	 * @throws DOMException
	 */
	public static function createLangI18nFragment(
		Document $doc, Bcp47Code $lang, string $key, ?array $params = null
	): DocumentFragment {
		$fragment = self::createEmptyLocalizationFragment( $doc );
		// Attach the explicit-language message info to the wrapper span.
		DOMDataUtils::setDataNodeI18n(
			$fragment->firstChild,
			I18nInfo::createLangI18n( $lang, $key, $params )
		);
		return $fragment;
	}

	/**
	 * Record on $element that attribute $name is to be localized, in a
	 * later pass, into the page content language. The element is also
	 * tagged with the "mw:LocalizedAttrs" typeof.
	 *
	 * @param Element $element element on which to add internationalization information
	 * @param string $name name of the attribute whose value will be localized
	 * @param string $key message key used for the attribute value localization
	 * @param ?array $params parameters for localization
	 */
	public static function addPageContentI18nAttribute(
		Element $element, string $name, string $key, ?array $params = null
	): void {
		$attrI18n = I18nInfo::createPageContentI18n( $key, $params );
		DOMUtils::addTypeOf( $element, 'mw:LocalizedAttrs' );
		DOMDataUtils::setDataAttrI18n( $element, $name, $attrI18n );
	}

	/** Adds to $element the internationalization information needed for the attribute $name to be
	 * localized in a later pass into the user interface language.
* @param Element $element element on which to add internationalization information
	 * @param string $name name of the attribute whose value will be localized
	 * @param string $key message key used for the attribute value localization
	 * @param ?array $params parameters for localization
	 */
	public static function addInterfaceI18nAttribute(
		Element $element, string $name, string $key, ?array $params = null
	): void {
		$attrI18n = I18nInfo::createInterfaceI18n( $key, $params );
		DOMUtils::addTypeOf( $element, 'mw:LocalizedAttrs' );
		DOMDataUtils::setDataAttrI18n( $element, $name, $attrI18n );
	}

	/**
	 * Record on $element that attribute $name is to be localized, in a
	 * later pass, into the provided language. The element is also tagged
	 * with the "mw:LocalizedAttrs" typeof.
	 * The use of this method is discouraged; use ::addPageContentI18nAttribute(...) and
	 * ::addInterfaceI18nAttribute(...) where possible rather than, respectively,
	 * ::addLangI18nAttribute(..., $wgContLang, ...) and ::addLangI18nAttribute(..., $wgLang, ...).
	 *
	 * @param Element $element element on which to add internationalization information
	 * @param Bcp47Code $lang language in which the message will be localized
	 * @param string $name name of the attribute whose value will be localized
	 * @param string $key message key used for the attribute value localization
	 * @param ?array $params parameters for localization
	 */
	public static function addLangI18nAttribute(
		Element $element, Bcp47Code $lang, string $name, string $key, ?array $params = null
	): void {
		$attrI18n = I18nInfo::createLangI18n( $lang, $key, $params );
		DOMUtils::addTypeOf( $element, 'mw:LocalizedAttrs' );
		DOMDataUtils::setDataAttrI18n( $element, $name, $attrI18n );
	}

	/**
	 * Match $node against the annotation meta pattern: a "meta" node whose
	 * typeof matches ANNOTATION_META_TYPE_REGEXP. Returns the matched type,
	 * or null when the node is not an annotation meta.
	 *
	 * @param Node $node
	 * @return ?string
	 */
	public static function matchAnnotationMeta( Node $node ): ?string {
		$pattern = self::ANNOTATION_META_TYPE_REGEXP;
		return DOMUtils::matchNameAndTypeOf( $node, 'meta', $pattern );
	}

	/**
	 * Extract the annotation type, excluding potential "/End"
suffix; returns null if not a valid
	 * annotation meta. &$isStart is set to true if the annotation is a start tag, false otherwise.
	 *
	 * @param Node $node
	 * @param bool &$isStart
	 * @return ?string The matched type, or null if no match.
	 */
	public static function extractAnnotationType( Node $node, bool &$isStart = false ): ?string {
		$typeOf = DOMUtils::matchTypeOf( $node, self::ANNOTATION_META_TYPE_REGEXP );
		if ( $typeOf === null ) {
			return null;
		}
		if ( !preg_match( self::ANNOTATION_META_TYPE_REGEXP, $typeOf, $matches ) ) {
			return null;
		}
		// Anything that does not end in "/End" is a start marker.
		$isStart = !str_ends_with( $typeOf, '/End' );
		return $matches[1];
	}

	/**
	 * True when $node is a meta element marking the start of an annotated
	 * part of the DOM.
	 *
	 * @param Node $node
	 * @return bool
	 */
	public static function isAnnotationStartMarkerMeta( Node $node ): bool {
		$isMeta = $node instanceof Element && DOMCompat::nodeName( $node ) === 'meta';
		if ( !$isMeta ) {
			return false;
		}
		$isStart = false;
		return self::extractAnnotationType( $node, $isStart ) !== null && $isStart;
	}

	/**
	 * True when $node is a meta element marking the end of an annotated
	 * part of the DOM.
	 *
	 * @param Node $node
	 * @return bool
	 */
	public static function isAnnotationEndMarkerMeta( Node $node ): bool {
		$isMeta = $node instanceof Element && DOMCompat::nodeName( $node ) === 'meta';
		if ( !$isMeta ) {
			return false;
		}
		$isStart = false;
		return self::extractAnnotationType( $node, $isStart ) !== null && !$isStart;
	}

	/**
	 * Report the "wasMoved" flag stored in the data-parsoid of an
	 * annotation meta element; false for any other node or when the flag
	 * is not set.
	 *
	 * @param Node $node
	 * @return bool
	 */
	public static function isMovedMetaTag( Node $node ): bool {
		if ( !( $node instanceof Element ) || self::matchAnnotationMeta( $node ) === null ) {
			return false;
		}
		return DOMDataUtils::getDataParsoid( $node )->wasMoved ?? false;
	}

	/**
	 * Returns true if a node is a (start or end) annotation meta tag
	 *
	 * @param ?Node $n
	 * @return bool
	 */
	public static function isMarkerAnnotation( ?Node $n ): bool {
		if ( $n === null ) {
			return false;
		}
		return self::matchAnnotationMeta( $n ) !== null;
	}

	/**
	 *
Extracts the media format from the attribute string * * @param Element $node * @return string */ public static function getMediaFormat( Element $node ): string { // TODO: Remove "Image|Video|Audio" when version 2.4.0 of the content // is no longer supported $mediaType = DOMUtils::matchTypeOf( $node, '#^mw:(File|Image|Video|Audio)(/|$)#' ); $parts = explode( '/', $mediaType ?? '' ); return $parts[1] ?? ''; } /** * @param Element $node * @return bool */ public static function hasVisibleCaption( Element $node ): bool { $format = self::getMediaFormat( $node ); return in_array( $format, [ 'Thumb', /* 'Manualthumb', FIXME(T305759) */ 'Frame' ], true ); } /** * Ref dom post-processing happens after adding media info, so the * linkbacks aren't available in the textContent added to the alt. * However, when serializing, they are in the caption elements. So, this * special handler drops the linkbacks for the purpose of comparison. * * @param Node $node * @return string */ public static function textContentFromCaption( Node $node ): string { $content = ''; $c = $node->firstChild; while ( $c ) { if ( $c instanceof Text ) { $content .= $c->nodeValue; } elseif ( $c instanceof Element && !DOMUtils::isMetaDataTag( $c ) && !DOMUtils::hasTypeOf( $c, "mw:Extension/ref" ) ) { $content .= self::textContentFromCaption( $c ); } $c = $c->nextSibling; } return $content; } } PK ! �S �S Utils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Psr\Log\LoggerInterface; use Wikimedia\Bcp47Code\Bcp47Code; use Wikimedia\Bcp47Code\Bcp47CodeValue; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Config\SiteConfig; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\Core\Sanitizer; use Wikimedia\Parsoid\NodeData\DataMw; use Wikimedia\Parsoid\Tokens\Token; use Wikimedia\Parsoid\Wikitext\Consts; /** * This file contains general utilities for token transforms. */ class Utils { /** * Regular expression fragment for matching wikitext comments. 
* Meant for inclusion in other regular expressions.
	 */
	// Maintenance note: this is used in /x regexes so all whitespace and # should be escaped
	public const COMMENT_REGEXP_FRAGMENT = '<!--(?>[\s\S]*?-->)';

	/** Regular expression for matching a wikitext comment */
	public const COMMENT_REGEXP = '/' . self::COMMENT_REGEXP_FRAGMENT . '/';

	/**
	 * Remove a leading "mwt" (optionally preceded by "#") from an about ID.
	 *
	 * @param string $aboutId about ID string
	 * @return string
	 */
	public static function stripParsoidIdPrefix( string $aboutId ): string {
		// 'mwt' is the prefix used for new ids
		$prefixPattern = '/^#?mwt/';
		return preg_replace( $prefixPattern, '', $aboutId );
	}

	/**
	 * Drop everything up to and including the last backslash of a
	 * fully qualified class name, leaving the bare class name.
	 *
	 * @param string $className
	 * @return string
	 */
	public static function stripNamespace( string $className ): string {
		$namespacePattern = '/.*\\\\/';
		return preg_replace( $namespacePattern, '', $className );
	}

	/**
	 * Report whether an about ID string starts with the "#mwt" prefix.
	 *
	 * @param string $aboutId about ID string
	 * @return bool
	 */
	public static function isParsoidObjectId( string $aboutId ): bool {
		// 'mwt' is the prefix used for new ids
		return strncmp( $aboutId, '#mwt', 4 ) === 0;
	}

	/**
	 * Report whether the named tag is listed among the HTML void tags
	 * (tags that can not have content).
	 *
	 * @param string $name tag name
	 * @return bool
	 */
	public static function isVoidElement( string $name ): bool {
		$isVoid = isset( Consts::$HTML['VoidTags'][$name] );
		return $isVoid;
	}

	/**
	 * Helper that always requests a deep clone from clone().
	 *
	 * @param object $el object
	 * @return object
	 */
	private static function recursiveClone( $el ) {
		$deepClone = true;
		return self::clone( $el, $deepClone );
	}

	/**
	 * Deep clones by default.
	 * @param object|array $obj arrays or plain objects
	 *    Tokens or DOM nodes shouldn't be passed in.
	 *
	 *    CAVEAT: It looks like debugging methods pass in arrays
	 *    that can have DOM nodes. So, for debugging purposes,
	 *    we handle top-level DOM nodes or DOM nodes embedded in arrays
	 *    But, this will miserably fail if an object embeds a DOM node.
*
	 * @param bool $deepClone
	 * @param bool $debug
	 * @return object|array
	 */
	public static function clone( $obj, $deepClone = true, $debug = false ) {
		if ( $debug ) {
			// Debug mode additionally copes with DOM nodes, either at the
			// top level or nested inside arrays.
			if ( $obj instanceof \DOMNode ) {
				return $obj->cloneNode( $deepClone );
			}
			if ( is_array( $obj ) ) {
				if ( $deepClone ) {
					return array_map(
						static function ( $o ) {
							return Utils::clone( $o, true, true );
						},
						$obj
					);
				} else {
					return $obj; // Copy-on-write cloning
				}
			}
		}

		if ( !$deepClone && is_object( $obj ) ) {
			return clone $obj;
		}

		// FIXME, see T161647
		// This will fail if $obj is (or embeds) a DOMNode
		return unserialize( serialize( $obj ) );
	}

	/**
	 * Extract the last *unicode* character of the string.
	 * This might be more than one byte, if the last character
	 * is non-ASCII.
	 *
	 * Returns '' when there is no character before $idx: for an empty
	 * string, or when $idx is not within 1..strlen( $str ). The backward
	 * scan over UTF-8 continuation bytes (10xxxxxx) stops at the start of
	 * the string, so malformed input never reads a negative offset.
	 *
	 * @param string $str
	 * @param ?int $idx The index *after* the character to extract; defaults
	 *   to the length of $str, which will extract the last character in
	 *   $str.
	 * @return string
	 */
	public static function lastUniChar( string $str, ?int $idx = null ): string {
		$len = strlen( $str );
		if ( $idx === null ) {
			$idx = $len;
		}
		// Also covers the empty string with the default index.
		if ( $idx <= 0 || $idx > $len ) {
			return '';
		}
		$c = $str[--$idx];
		// ord() inspects the first byte of $c, i.e. the byte most recently
		// prepended; keep going while it is a continuation byte.
		while ( $idx > 0 && ( ord( $c ) & 0xC0 ) === 0x80 ) {
			$c = $str[--$idx] . $c;
		}
		return $c;
	}

	/**
	 * Return true if the first character in $s is a unicode word character.
	 * @param string $s
	 * @return bool
	 */
	public static function isUniWord( string $s ): bool {
		return preg_match( '#^\w#u', $s ) === 1;
	}

	/**
	 * This should not be used; it always throws.
	 * @param string $txt URL to encode using PHP encoding
	 * @return string
	 */
	public static function phpURLEncode( $txt ) {
		// @phan-suppress-previous-line PhanPluginNeverReturnMethod
		throw new \BadMethodCallException( 'Use urlencode( $txt ) instead' );
	}

	/**
	 * Percent-decode only valid UTF-8 characters, leaving other encoded bytes alone.
	 *
	 * Distinct from `decodeURIComponent` in that certain escapes are not decoded,
	 * matching the behavior of JavaScript's decodeURI().
*
	 * @see https://www.ecma-international.org/ecma-262/6.0/#sec-decodeuri-encodeduri
	 * @param string $s URI to be decoded
	 * @return string
	 */
	public static function decodeURI( string $s ): string {
		// Escape the '%' in sequences for the reserved characters, then use decodeURIComponent.
		$protected = preg_replace( '/%(?=2[346bcfBCF]|3[abdfABDF]|40)/', '%25', $s );
		return self::decodeURIComponent( $protected );
	}

	/**
	 * Percent-decode $s where the result is valid UTF-8.
	 *
	 * The whole string is decoded in one go when that yields valid UTF-8.
	 * Otherwise each escape sequence shaped like a one- to four-byte UTF-8
	 * character is decoded on its own and kept only if it is valid; every
	 * other escape is left untouched.
	 *
	 * @param string $s URI to be decoded
	 * @return string
	 */
	public static function decodeURIComponent( string $s ): string {
		// Most of the time we should have valid input
		$decoded = rawurldecode( $s );
		if ( mb_check_encoding( $decoded, 'UTF-8' ) ) {
			return $decoded;
		}

		// Extract each encoded character and decode it individually
		// phpcs:ignore Generic.Files.LineLength.TooLong
		$encodedChar = '/%[0-7][0-9A-F]|%[CD][0-9A-F]%[89AB][0-9A-F]|%E[0-9A-F](?:%[89AB][0-9A-F]){2}|%F[0-4](?:%[89AB][0-9A-F]){3}/i';
		return preg_replace_callback(
			$encodedChar,
			static function ( $match ) {
				$char = rawurldecode( $match[0] );
				if ( mb_check_encoding( $char, 'UTF-8' ) ) {
					return $char;
				}
				return $match[0];
			},
			$s
		);
	}

	/**
	 * Return the token's 'source' attribute with the extension tags
	 * stripped, using the tag offsets stored in its data-parsoid.
	 *
	 * @param Token $token token
	 * @return string
	 */
	public static function extractExtBody( Token $token ): string {
		$extTagOffsets = $token->dataParsoid->extTagOffsets;
		'@phan-var \Wikimedia\Parsoid\Core\DomSourceRange $extTagOffsets';
		$src = $token->getAttributeV( 'source' );
		return $extTagOffsets->stripTags( $src );
	}

	/**
	 * An offset is valid when it is set and not negative.
	 *
	 * @param ?int $n offset to check
	 * @return bool
	 */
	private static function isValidOffset( ?int $n ): bool {
		if ( $n === null ) {
			return false;
		}
		return $n >= 0;
	}

	/**
	 * Basic check if a DOM Source Range (DSR) is valid.
	 *
	 * Clarifications about the "basic validity checks":
	 * - Only checks for underflow, not for overflow.
* - Does not verify that start <= end
	 * - Does not verify that openWidth + endWidth <= end - start
	 *   (even so, the values might be invalid because of content)
	 * These would be overkill for our purposes. Given how DSR computation
	 * works in this codebase, the real scenarios we care about are
	 * non-null / non-negative values since that can happen.
	 *
	 * @param ?DomSourceRange $dsr DSR source range values
	 * @param bool $all Also check the widths of the container tag
	 * @return bool
	 */
	public static function isValidDSR(
		?DomSourceRange $dsr, bool $all = false
	): bool {
		if ( $dsr === null ) {
			return false;
		}
		if ( !self::isValidOffset( $dsr->start ) || !self::isValidOffset( $dsr->end ) ) {
			return false;
		}
		if ( !$all ) {
			return true;
		}
		return self::isValidOffset( $dsr->openWidth ) &&
			self::isValidOffset( $dsr->closeWidth );
	}

	/**
	 * Canonicalize a namespace name: lower-case it and turn spaces into
	 * underscores.
	 *
	 * @param string $name Non-normalized namespace name.
	 * @return string
	 */
	public static function normalizeNamespaceName( string $name ): string {
		return str_replace( ' ', '_', mb_strtolower( $name ) );
	}

	/**
	 * Decode HTML5 entities in wikitext.
	 *
	 * NOTE that wikitext only allows semicolon-terminated entities, while
	 * HTML allows a number of "legacy" entities to be decoded without
	 * a terminating semicolon.  This function deliberately does not
	 * decode these HTML-only entity forms.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeWtEntities( string $text ): string {
		// Note that HTML5 allows semicolon-less entities which
		// wikitext does not: in wikitext all entities must end in a
		// semicolon.
		// By normalizing before decoding, this routine deliberately
		// does not decode entity references which are invalid in wikitext
		// (mostly because they decode to invalid codepoints).
		$normalized = Sanitizer::normalizeCharReferences( $text );
		return Sanitizer::decodeCharReferences( $normalized );
	}

	/**
	 * Entity-escape anything that would decode to a valid wikitext entity.
*
	 * Note that HTML5 allows certain "semicolon-less" entities, like
	 * `&para`; these aren't allowed in wikitext and won't be escaped
	 * by this function.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function escapeWtEntities( string $text ): string {
		// We just want to encode ampersands that precede valid entities.
		// (And note that semicolon-less entities aren't valid wikitext.)
		return preg_replace_callback( '/&[#0-9a-zA-Z\x80-\xff]+;/', function ( $match ) {
			$m = $match[0];
			$decodedChar = self::decodeWtEntities( $m );
			if ( $decodedChar !== $m ) {
				// Escape the ampersand: substr() drops the leading '&',
				// which is re-emitted as '&amp;'.
				return '&amp;' . substr( $m, 1 );
			} else {
				// Not an entity, just return the string
				return $m;
			}
		}, $text );
	}

	/**
	 * Convert special characters to HTML entities
	 *
	 * On top of what htmlspecialchars() escapes, U+0338 (COMBINING LONG
	 * SOLIDUS OVERLAY) is written as a numeric entity.
	 *
	 * @param string $s
	 * @return string
	 */
	public static function escapeHtml( string $s ): string {
		// Only encodes five characters: " ' & < >
		$s = htmlspecialchars( $s, ENT_QUOTES | ENT_HTML5 );
		$s = str_replace( "\u{0338}", '&#x338;', $s );
		return $s;
	}

	/**
	 * Encode all characters as entity references.  This is done to make
	 * characters safe for wikitext (regardless of whether they are
	 * HTML-safe). Typically only called with single-codepoint strings.
	 * @param string $s
	 * @return string
	 */
	public static function entityEncodeAll( string $s ): string {
		// This is Unicode aware.
		static $conventions = [
			// We always use at least two characters for the hex code
			'&#x0;' => '&#x00;', '&#x1;' => '&#x01;', '&#x2;' => '&#x02;', '&#x3;' => '&#x03;',
			'&#x4;' => '&#x04;', '&#x5;' => '&#x05;', '&#x6;' => '&#x06;', '&#x7;' => '&#x07;',
			'&#x8;' => '&#x08;', '&#x9;' => '&#x09;', '&#xA;' => '&#x0A;', '&#xB;' => '&#x0B;',
			'&#xC;' => '&#x0C;', '&#xD;' => '&#x0D;', '&#xE;' => '&#x0E;', '&#xF;' => '&#x0F;',
			// By convention we use &nbsp; where possible
			'&#xA0;' => '&nbsp;',
		];

		// Every code point in 0..0x10ffff becomes a hex numeric entity; the
		// table above then rewrites the few entities with a preferred form.
		return strtr( mb_encode_numericentity(
			$s, [ 0, 0x10ffff, 0, ~0 ], 'utf-8', true
		), $conventions );
	}

	/**
	 * Determine whether the protocol of a link is potentially valid.  Use the
	 * environment's per-wiki config to do so.
	 *
	 * Non-string link targets are always reported as valid.
	 *
	 * @param mixed $linkTarget
	 * @param Env $env
	 * @return bool
	 */
	public static function isProtocolValid( $linkTarget, Env $env ): bool {
		$siteConf = $env->getSiteConfig();
		if ( is_string( $linkTarget ) ) {
			return $siteConf->hasValidProtocol( $linkTarget );
		} else {
			return true;
		}
	}

	/**
	 * Get argument information for an extension tag token.
	 *
	 * @param Token $extToken
	 * @return DataMw
	 */
	public static function getExtArgInfo( Token $extToken ): DataMw {
		$name = $extToken->getAttributeV( 'name' );
		$options = $extToken->getAttributeV( 'options' );
		$defaultDataMw = new DataMw( [
			'name' => $name,
			// T367616: 'attrs' should be renamed to 'extAttrs'
			'attrs' => (object)TokenUtils::kvToHash( $options ),
		] );
		$extTagOffsets = $extToken->dataParsoid->extTagOffsets;
		if ( $extTagOffsets->closeWidth !== 0 ) {
			// If not self-closing...
			$defaultDataMw->body = (object)[
				'extsrc' => self::extractExtBody( $extToken ),
			];
		}
		return $defaultDataMw;
	}

	/**
	 * Parse media dimensions
	 *
	 * @param SiteConfig $siteConfig
	 * @param string $str media dimension string to parse
	 * @param bool $onlyOne If set, returns null if multiple dimensions are present
	 * @param bool $localized Defaults to false; set to true if the $str
	 *   has already been matched against `img_width` to localize the `px`
	 *   suffix.
	 * @return ?array{x:int,y?:int,bogusPx:bool}
	 */
	public static function parseMediaDimensions(
		SiteConfig $siteConfig, string $str, bool $onlyOne = false,
		bool $localized = false
	): ?array {
		if ( !$localized ) {
			$getOption = $siteConfig->getMediaPrefixParameterizedAliasMatcher();
			$bits = $getOption( $str );
			$normalizedBit0 = $bits ?
mb_strtolower( trim( $bits['k'] ) ) : null; if ( $normalizedBit0 === 'img_width' ) { $str = $bits['v']; } } $dimensions = null; // We support a trailing 'px' here for historical reasons // (T15500, T53628, T207032) if ( preg_match( '/^(\d*)(?:x(\d+))?\s*(px\s*)?$/D', $str, $match ) ) { $dimensions = [ 'x' => null, 'y' => null, 'bogusPx' => false ]; if ( !empty( $match[1] ) ) { $dimensions['x'] = intval( $match[1], 10 ); } if ( !empty( $match[2] ) ) { if ( $onlyOne ) { return null; } $dimensions['y'] = intval( $match[2], 10 ); } if ( !empty( $match[3] ) ) { $dimensions['bogusPx'] = true; } } return $dimensions; } /** * Validate media parameters * More generally, this is defined by the media handler in core * * @param ?int $num * @return bool */ public static function validateMediaParam( ?int $num ): bool { return $num !== null && $num > 0; } /** * FIXME: Is this needed?? * * Extract content in a backwards compatible way * * @param object $revision * @return object */ public static function getStar( $revision ) { // @phan-suppress-previous-line PhanPluginNeverReturnMethod /* $content = $revision; if ( $revision && isset( $revision->slots ) ) { $content = $revision->slots->main; } return $content; */ throw new \BadMethodCallException( "This method shouldn't be needed. " . "But, port this if you really need it." ); } /** * This regex was generated by running through *all unicode characters* and * testing them against *all regexes* for linktrails in a default MW install. * We had to treat it a little bit, here's what we changed: * * 1. A-Z, though allowed in Walloon, is disallowed. * 2. '"', though allowed in Chuvash, is disallowed. * 3. '-', though allowed in Icelandic (possibly due to a bug), is disallowed. * 4. '1', though allowed in Lak (possibly due to a bug), is disallowed. */ // phpcs:disable Generic.Files.LineLength.TooLong public static $linkTrailRegex = '/^[^\0-`{÷ĀĈ-ČĎĐĒĔĖĚĜĝĠ-ĪĬ-įIJĴ-ĹĻ-ĽĿŀŅņʼnŊŌŎŏŒŔŖ-ŘŜŝŠŤŦŨŪ-ŬŮŲ-ŴŶŸ' . 
'ſ-ǤǦǨǪ-Ǯǰ-ȗȜ-ȞȠ-ɘɚ-ʑʓ-ʸʽ-̂̄-΅·Ϗ-ЯѐѝѠѢѤѦѨѪѬѮѰѲѴѶѸѺ-ѾҀ-҃҅-ҐҒҔҕҘҚҜ-ҠҤ-ҪҬҭҰҲ' . 'Ҵ-ҶҸҹҼ-ҿӁ-ӗӚ-ӜӞӠ-ӢӤӦӪ-ӲӴӶ-ՠֈ--ؠً-ٳٵ-ٽٿ-څڇ-ڗڙ-ڨڪ-ڬڮڰ-ڽڿ-ۅۈ-ۊۍ-۔ۖ--' . '---੯ੴ-ჱ-ẼẾ-\x{200b}\x{200d}-‒—-‗‚‛”--\x{fffd}]+$/D'; // phpcs:enable Generic.Files.LineLength.TooLong /** * Check whether some text is a valid link trail. * * @param string $text * @return bool */ public static function isLinkTrail( string $text ): bool { return $text !== '' && preg_match( self::$linkTrailRegex, $text ); } /** * Convert BCP-47-compliant language code to MediaWiki-internal code. * * This is a temporary back-compatibility hack; Parsoid should be * using BCP 47 strings or Bcp47Code objects in all its external APIs. * Try to avoid using it, though: there's no guarantee * that this mapping will remain in sync with upstream. * * @param string|Bcp47Code $code BCP-47 language code * @return string MediaWiki-internal language code */ public static function bcp47ToMwCode( $code ): string { // This map is dumped from // LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING in core, but // with keys and values swapped and BCP-47 codes lowercased: // // array_flip(array_map(strtolower, // LanguageCode::NON_STANDARD_LANGUAGE_CODE_MAPPING)) // // Hopefully we will be able to deprecate and remove this from // Parsoid quickly enough that keeping it in sync with upstream // is not an issue. 
static $MAP = [ "cbk" => "cbk-zam", "de-x-formal" => "de-formal", "egl" => "eml", "en-x-rtl" => "en-rtl", "es-x-formal" => "es-formal", "hu-x-formal" => "hu-formal", "jv-x-bms" => "map-bms", "ro-cyrl-md" => "mo", "nrf" => "nrm", "nl-x-informal" => "nl-informal", "nap-x-tara" => "roa-tara", "en-simple" => "simple", "sr-cyrl" => "sr-ec", "sr-latn" => "sr-el", "zh-hans-cn" => "zh-cn", "zh-hans-sg" => "zh-sg", "zh-hans-my" => "zh-my", "zh-hant-tw" => "zh-tw", "zh-hant-hk" => "zh-hk", "zh-hant-mo" => "zh-mo", ]; if ( $code instanceof Bcp47Code ) { $code = $code->toBcp47Code(); } $code = strtolower( $code ); // All MW-internal codes are lowercase return $MAP[$code] ?? $code; } /** * Convert MediaWiki-internal language code to a BCP-47-compliant * language code suitable for including in HTML. * * This is a temporary back-compatibility hack, needed for compatibility * when running in standalone mode with MediaWiki Action APIs which expose * internal language codes. These APIs should eventually be improved * so that they also expose BCP-47 compliant codes, which can then be * used directly by Parsoid without conversion. But until that day * comes, this function will paper over the differences. * * Note that MediaWiki-internal Language objects implement Bcp47Code, * so we can transition interfaces which currently take a string code * to pass a Language object instead; that will make this method * effectively a no-op and avoid the issue of upstream sync of the * mapping table. * * @param string|Bcp47Code $code MediaWiki-internal language code or object * @param bool $strict If true, this code will log a deprecation message * or fail if a MediaWiki-internal language code is passed. * @param ?LoggerInterface $warnLogger A deprecation warning will be * emitted on $warnLogger if $strict is true and a string-valued * MediaWiki-internal language code is passed; otherwise an exception * will be thrown. * @return Bcp47Code BCP-47 language code. 
* @see LanguageCode::bcp47() */ public static function mwCodeToBcp47( $code, bool $strict = false, ?LoggerInterface $warnLogger = null ): Bcp47Code { if ( $code instanceof Bcp47Code ) { return $code; } if ( $strict ) { $msg = "Use of string-valued BCP-47 codes is deprecated."; if ( defined( 'MW_PHPUNIT_TEST' ) || defined( 'MW_PARSER_TEST' ) ) { // Always throw an error if running tests throw new \Error( $msg ); } if ( $warnLogger ) { $warnLogger->warning( $msg ); } else { // Strict mode requested but no deprecation logger provided throw new \Error( $msg ); } } // This map is dumped from // LanguageCode::getNonstandardLanguageCodeMapping() in core. // Hopefully we will be able to deprecate and remove this method // from Parsoid quickly enough that keeping it in sync with upstream // will not be an issue. static $MAP = [ "als" => "gsw", "bat-smg" => "sgs", "be-x-old" => "be-tarask", "fiu-vro" => "vro", "roa-rup" => "rup", "zh-classical" => "lzh", "zh-min-nan" => "nan", "zh-yue" => "yue", "cbk-zam" => "cbk", "de-formal" => "de-x-formal", "eml" => "egl", "en-rtl" => "en-x-rtl", "es-formal" => "es-x-formal", "hu-formal" => "hu-x-formal", "map-bms" => "jv-x-bms", "mo" => "ro-Cyrl-MD", "nrm" => "nrf", "nl-informal" => "nl-x-informal", "roa-tara" => "nap-x-tara", "simple" => "en-simple", "sr-ec" => "sr-Cyrl", "sr-el" => "sr-Latn", "zh-cn" => "zh-Hans-CN", "zh-sg" => "zh-Hans-SG", "zh-my" => "zh-Hans-MY", "zh-tw" => "zh-Hant-TW", "zh-hk" => "zh-Hant-HK", "zh-mo" => "zh-Hant-MO", ]; $code = $MAP[$code] ?? $code; // The rest of this code is copied verbatim from LanguageCode::bcp47() // in core. 
$codeSegment = explode( '-', $code ); $codeBCP = []; foreach ( $codeSegment as $segNo => $seg ) { // when previous segment is x, it is a private segment and should be lc if ( $segNo > 0 && strtolower( $codeSegment[( $segNo - 1 )] ) == 'x' ) { $codeBCP[$segNo] = strtolower( $seg ); // ISO 3166 country code } elseif ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) { $codeBCP[$segNo] = strtoupper( $seg ); // ISO 15924 script code } elseif ( ( strlen( $seg ) == 4 ) && ( $segNo > 0 ) ) { $codeBCP[$segNo] = ucfirst( strtolower( $seg ) ); // Use lowercase for other cases } else { $codeBCP[$segNo] = strtolower( $seg ); } } return new Bcp47CodeValue( implode( '-', $codeBCP ) ); } /** * BCP 47 codes are case-insensitive, so this helper does a "proper" * comparison of Bcp47Code objects. * @param Bcp47Code $a * @param Bcp47Code $b * @return bool true iff $a and $b represent the same language */ public static function isBcp47CodeEqual( Bcp47Code $a, Bcp47Code $b ): bool { return strcasecmp( $a->toBcp47Code(), $b->toBcp47Code() ) === 0; } } PK ! <(#R/ / ComputeSelectiveStats.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Config\PageConfig; use Wikimedia\Parsoid\Core\PageBundle; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\Html2Wt\DiffUtils; use Wikimedia\Parsoid\Html2Wt\DOMDiff; use Wikimedia\Parsoid\NodeData\TemplateInfo; /** * This file contains code to classify opportunities for selective * update and collect statistics. 
*/
class ComputeSelectiveStats {
	/**
	 * Classify a re-render of a page relative to its previous render and
	 * return a fixed set of string labels describing what changed.
	 *
	 * The returned array always has the same keys in the same order:
	 *  - 'type': 'missing-prev' (no previous page/bundle supplied),
	 *    'no-op', 'template-update' or 'page-update'
	 *  - 'same-wt': 'true', 'false' or 'unknown'
	 *  - 'rev-diff': '0', '1', 'minus1' or 'unknown'
	 *  - 'changed-sections', 'changed-template-sites',
	 *    'changed-template-names': '0', '1', '2plus' or 'unknown'
	 *
	 * @param Env $env Environment handed to DOMDiff, DTState and the
	 *   page bundle loader
	 * @param ?PageConfig $oldPage Previous page, or null if not available
	 * @param ?PageBundle $oldPb Previous render, or null if not available
	 * @param PageConfig $newPage
	 * @param PageBundle $newPb
	 * @return array<string,string>
	 */
	public static function classify(
		Env $env,
		?PageConfig $oldPage, ?PageBundle $oldPb,
		PageConfig $newPage, PageBundle $newPb
	): array {
		// Default labels (ensure keys are consistent & in consistent order)
		$labels = [
			'type' => 'missing-prev',
			'same-wt' => 'unknown',
			'rev-diff' => 'unknown',
			'changed-sections' => 'unknown',
			'changed-template-sites' => 'unknown',
			'changed-template-names' => 'unknown',
		];
		if ( $oldPage === null || $oldPb === null ) {
			return $labels;
		}
		$oldWt = self::pc2wt( $oldPage );
		$newWt = self::pc2wt( $newPage );

		// Compare wikitext in both revisions. This must be a strict
		// comparison: with `==` PHP compares numeric-looking strings
		// numerically (e.g. "1e3" == "1000"), which would make this label
		// disagree with the 'type' label computed below.
		$sameWt = ( $oldWt === $newWt );
		$labels['same-wt'] = self::bool2str( $sameWt );

		// Compare revision IDs
		$oldRev = $oldPage->getRevisionId();
		$newRev = $newPage->getRevisionId();
		if ( $oldRev === $newRev ) {
			// same revision (template update, most likely)
			$labels['rev-diff'] = '0';
		} elseif ( $oldRev === $newPage->getParentRevisionId() ) {
			// "normal edit": new revision is the one after old revision
			$labels['rev-diff'] = '1';
		} elseif ( $newRev === $oldPage->getParentRevisionId() ) {
			// new revision is the one *before* old revision
			// This is probably a render triggered from RevisionOutputCache
			// of the previous revision where the "oldRev" is coming from
			// the parser cache and is thus the latest. This may happen
			// during races, vandalism patrol, HTML diffing, etc.
			$labels['rev-diff'] = 'minus1';
		}

		// Parse to DOM and diff
		$oldDoc = self::pb2doc( $env, $oldPb );
		$newDoc = self::pb2doc( $env, $newPb );
		$dd = new DOMDiff( $env );
		// Don't skip over template content!
		$dd->skipEncapsulatedContent = false;
		// Ignore differences in the data-parsoid properties listed below
		// ('tmp' and the various source offsets).
		$cleanDP = static function ( $dp ) {
			$dp = $dp->clone();
			foreach ( [ 'tmp', 'tsr', 'dsr', 'extTagOffsets', 'extLinkContentOffsets' ] as $prop ) {
				unset( $dp->$prop );
			}
			return $dp;
		};
		$dd->specializedAttribHandlers['data-parsoid'] = static function ( $nA, $vA, $nB, $vB ) use ( $cleanDP ) {
			// Loose comparison is deliberate here: the operands are
			// objects and we want property-wise equality, not identity.
			return $cleanDP( $vA ) == $cleanDP( $vB );
		};
		// Ignore differences in 'id' attributes, since these are a side-effect
		// of data-parsoid/page bundle encapsulation.
		$dd->specializedAttribHandlers['id'] = static function ( $nA, $vA, $nB, $vB ) {
			// XXX we can't really tell synthetic ID attributes created by
			// DOMDataUtils::storeInPageBundle() from "real" ID attributes
			// in user wikitext.  Hackishly ignore differences in any ID
			// attributes that begin with 'mw' even though technically you
			// could have a <span id="mw-something"> in wikitext, and change
			// that to <span id="mw-different-thing"> and with this attribute
			// handler DOM diff wouldn't flag the change.  In theory we should
			// be using shadow attributes to record when an id was synthetic.
			if ( str_starts_with( $vA, 'mw' ) && str_starts_with( $vB, 'mw' ) ) {
				return true; // equal enough
			}
			return $vA === $vB;
		};
		[ 'isEmpty' => $emptyDiff ] = $dd->diff(
			DOMCompat::getBody( $oldDoc ),
			DOMCompat::getBody( $newDoc )
		);
		if ( $sameWt ) {
			// old and new wikitext identical.  is html also identical?
			$labels['type'] = $emptyDiff ? 'no-op' : 'template-update';
		} else {
			$labels['type'] = 'page-update';
		}

		// Use a DOMTraverser to count how many sections and templates were
		// modified.  (Skip attribute embedded HTML for now.)
		$dt = new DOMTraverser( true );
		$sectionsModified = 0;
		$dt->addHandler( 'section', static function ( Element $el ) use ( &$sectionsModified ) {
			if ( WTUtils::isParsoidSectionTag( $el ) && !DiffUtils::subtreeUnchanged( $el ) ) {
				$sectionsModified++;
			}
			return true;
		} );
		$templatesModified = 0;
		$namedTemplates = [];
		$dt->addHandler( null, static function ( $el, $state ) use ( &$templatesModified, &$namedTemplates ) {
			if ( !( $el instanceof Element ) ) {
				return true;
			}
			if (
				$el === ( $state->tplInfo->first ?? null ) &&
				DOMUtils::hasTypeOf( $el, 'mw:Transclusion' )
			) {
				$changed = false;
				$about = DOMCompat::getAttribute( $el, 'about' );
				foreach ( WTUtils::getAboutSiblings( $el, $about ) as $sib ) {
					// Note that we might miss a change here in a sibling
					// which is fosterable IEW, since that's !Element.
					if ( $sib instanceof Element && !DiffUtils::subtreeUnchanged( $sib ) ) {
						$changed = true;
						break;
					}
				}
				// Compute the number of templates modified
				if ( $changed ) {
					$templatesModified++;
					$dataMw = DOMDataUtils::getDataMw( $el );
					// Use the href of the first TemplateInfo part that has one
					$name = null;
					foreach ( $dataMw->parts ?? [] as $part ) {
						if ( $part instanceof TemplateInfo ) {
							$name ??= $part->href;
						}
					}
					// Keyed by name so each template name is counted once
					$namedTemplates[$name ?? 'unknown'] = true;
				}
				// Don't recurse into templates, just tabulate top-level
				$state->tplInfo->clear = true;
				return $state->tplInfo->last->nextSibling;
			}
			return true;
		} );
		// do the traversal
		$dt->traverse( null, DOMCompat::getBody( $newDoc ), new DTState( $env ) );

		// report changed sections as '0', '1', or '2plus'
		$labels['changed-sections'] = self::int2str( $sectionsModified, 2 );
		// report changed templates as '0', '1', or '2plus'
		$labels['changed-template-sites'] = self::int2str( $templatesModified, 2 );
		// report the count of the *names* of the templates that were updated.
		$labels['changed-template-names'] = self::int2str( count( $namedTemplates ), 2 );

		// TODO: sum up the time spent on modified (vs unmodified) templates

		return $labels;
	}

	// ----------- Helper functions ---------------

	/**
	 * Convert a page bundle to a DOM Document, loading its data attributes
	 * with the 'markNew' option set.
	 *
	 * @param Env $env Currently unused by this helper
	 * @param PageBundle $pb
	 * @return Document
	 */
	private static function pb2doc( Env $env, PageBundle $pb ): Document {
		$doc = $pb->toDom();
		DOMDataUtils::prepareDoc( $doc );
		$body = DOMCompat::getBody( $doc );
		'@phan-var Element $body'; // assert non-null
		DOMDataUtils::visitAndLoadDataAttribs( $body, [ 'markNew' => true ] );
		return $doc;
	}

	/**
	 * Convert a PageConfig to a wikitext string (content of the 'main' slot
	 * of its revision content).
	 *
	 * @param PageConfig $pc
	 * @return string
	 */
	private static function pc2wt( PageConfig $pc ): string {
		return $pc->getRevisionContent()->getContent( 'main' );
	}

	/**
	 * Convert a boolean to a string for labelling purposes.
	 *
	 * @param ?bool $val
	 * @return string 'true', 'false', or 'unknown' for null
	 */
	private static function bool2str( ?bool $val ): string {
		if ( $val === null ) {
			return 'unknown';
		}
		return $val ? 'true' : 'false';
	}

	/**
	 * Convert an integer to a string for labelling purposes.
	 *
	 * @param ?int $val
	 * @param ?int $limit If given, any value greater than or equal to the
	 *   limit is reported as "<limit>plus" (e.g. '2plus')
	 * @return string The decimal value, "<limit>plus", or 'unknown' for null
	 */
	private static function int2str( ?int $val, ?int $limit = null ): string {
		if ( $val === null ) {
			return 'unknown';
		}
		if ( $limit !== null && $val >= $limit ) {
			return "{$limit}plus";
		}
		return "$val";
	}
}
PK ! 
�Q�@ TitleValue.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Parsoid\Core\LinkTarget; use Wikimedia\Parsoid\Core\LinkTargetTrait; /** * Lightweight title class */ class TitleValue implements LinkTarget { use LinkTargetTrait; /** @var string */ private $interwiki; /** @var int */ private $namespaceId; /** @var string */ private $dbkey; /** @var string */ private $fragment; /** * @param int $namespaceId * @param string $dbkey Page DBkey (with underscores, not spaces) * @param string $fragment Fragment suffix, or empty string if none * @param string $interwiki Interwiki prefix, or empty string if none */ private function __construct( int $namespaceId, string $dbkey, string $fragment = '', string $interwiki = '' ) { $this->namespaceId = $namespaceId; $this->dbkey = strtr( $dbkey, ' ', '_' ); $this->fragment = $fragment; $this->interwiki = $interwiki; } /** * Constructs a TitleValue, or returns null if the parameters are not valid. * * @note This does not perform any normalization, and only basic validation. * * @param int $namespace The namespace ID. This is not validated. * @param string $title The page title in either DBkey or text form. No normalization is applied * beyond underscore/space conversion. * @param string $fragment The fragment title. Use '' to represent the whole page. * No validation or normalization is applied. * @param string $interwiki The interwiki component. * No validation or normalization is applied. 
* @return TitleValue|null */ public static function tryNew( int $namespace, string $title, string $fragment = '', string $interwiki = '' ): ?TitleValue { return new static( $namespace, $title, $fragment, $interwiki ); } /** @inheritDoc */ public function getNamespace(): int { return $this->namespaceId; } /** @inheritDoc */ public function getFragment(): string { return $this->fragment; } /** @inheritDoc */ public function getDBkey(): string { return $this->dbkey; } /** @inheritDoc */ public function createFragmentTarget( string $fragment ): self { return new static( $this->namespaceId, $this->dbkey, $fragment, $this->interwiki ); } /** @inheritDoc */ public function getInterwiki(): string { return $this->interwiki; } } PK ! Y��J�0 �0 Title.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Assert\Assert; use Wikimedia\IPUtils; use Wikimedia\Parsoid\Config\SiteConfig; use Wikimedia\Parsoid\Core\LinkTarget; use Wikimedia\Parsoid\Core\LinkTargetTrait; class Title implements LinkTarget { use LinkTargetTrait; /** @var string */ private $interwiki; /** @var int */ private $namespaceId; /** @var string */ private $namespaceName; /** @var string */ private $dbkey; /** @var string */ private $fragment; // cached values of prefixed title/key private ?string $prefixedDBKey = null; private ?string $prefixedText = null; /** * @param string $interwiki Interwiki prefix, or empty string if none * @param string $key Page DBkey (with underscores, not spaces) * @param int $namespaceId * @param string $namespaceName (with spaces, not underscores) * @param ?string $fragment */ private function __construct( string $interwiki, string $key, int $namespaceId, string $namespaceName, ?string $fragment = null ) { $this->interwiki = $interwiki; $this->dbkey = $key; $this->namespaceId = $namespaceId; $this->namespaceName = $namespaceName; $this->fragment = $fragment ?? 
''; } public static function newFromText( string $title, SiteConfig $siteConfig, ?int $defaultNs = null ): Title { if ( $defaultNs === null ) { $defaultNs = 0; } $origTitle = $title; if ( !mb_check_encoding( $title, 'UTF-8' ) ) { throw new TitleException( "Bad UTF-8 in title \"$origTitle\"", 'title-invalid-utf8', $origTitle ); } // Strip Unicode bidi override characters. $title = preg_replace( '/[\x{200E}\x{200F}\x{202A}-\x{202E}]+/u', '', $title ); if ( $title === null ) { throw new TitleException( "Bad UTF-8 in title \"$origTitle\"", 'title-invalid-utf8', $origTitle ); } // Clean up whitespace $title = preg_replace( '/[ _\x{00A0}\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u', '_', $title ); // Trim _ from beginning and end $title = trim( $title, '_' ); if ( str_contains( $title, \UtfNormal\Constants::UTF8_REPLACEMENT ) ) { throw new TitleException( "Bad UTF-8 in title \"$title\"", 'title-invalid-utf8', $title ); } // Initial colon indicates main namespace rather than specified default // but should not create invalid {ns,title} pairs such as {0,Project:Foo} if ( $title !== '' && $title[0] === ':' ) { $title = ltrim( substr( $title, 1 ), '_' ); $defaultNs = 0; } if ( $title === '' ) { throw new TitleException( 'Empty title', 'title-invalid-empty', $title ); } $ns = $defaultNs; $interwiki = null; # Namespace or interwiki prefix $prefixRegexp = "/^(.+?)_*:_*(.*)$/S"; // MediaWikiTitleCodec::splitTitleString wraps a loop around the // next section, to allow it to repeat this prefix processing if // an interwiki prefix is found which points at the local wiki. $m = []; if ( preg_match( $prefixRegexp, $title, $m ) ) { $p = $m[1]; $pLower = mb_strtolower( $p ); $nsId = $siteConfig->canonicalNamespaceId( $pLower ) ?? 
$siteConfig->namespaceId( $pLower ); if ( $nsId !== null ) { $title = $m[2]; $ns = $nsId; # For Talk:X pages, check if X has a "namespace" prefix if ( $nsId === $siteConfig->canonicalNamespaceId( 'talk' ) && preg_match( $prefixRegexp, $title, $x ) ) { $xLower = mb_strtolower( $x[1] ); if ( $siteConfig->namespaceId( $xLower ) ) { // Disallow Talk:File:x type titles. throw new TitleException( "Invalid Talk namespace title \"$origTitle\"", 'title-invalid-talk-namespace', $title ); } elseif ( $siteConfig->interwikiMapNoNamespaces()[$xLower] ?? null ) { // Disallow Talk:Interwiki:x type titles. throw new TitleException( "Invalid Talk namespace title \"$origTitle\"", 'title-invalid-talk-namespace', $title ); } } } elseif ( $siteConfig->interwikiMapNoNamespaces()[$pLower] ?? null ) { # Interwiki link $title = $m[2]; $interwiki = $pLower; # We don't check for a redundant interwiki prefix to the # local wiki, like core does here in # MediaWikiTitleCodec::splitTitleString; # core then does a `continue` to repeat the processing // If there's an initial colon after the interwiki, that also // resets the default namespace if ( $title !== '' && $title[0] === ':' ) { $title = trim( substr( $title, 1 ), '_' ); $ns = 0; } } # If there's no recognized interwiki or namespace, # then let the colon expression be part of the title } $fragment = null; $fragmentIndex = strpos( $title, '#' ); if ( $fragmentIndex !== false ) { $fragment = substr( $title, $fragmentIndex + 1 ); $title = rtrim( substr( $title, 0, $fragmentIndex ), '_' ); } $illegalCharsRe = '/[^' . $siteConfig->legalTitleChars() . ']' // URL percent encoding sequences interfere with the ability // to round-trip titles -- you can't link to them consistently. . '|%[0-9A-Fa-f]{2}' // XML/HTML character references produce similar issues. . 
'|&[A-Za-z0-9\x80-\xff]+;/S'; if ( preg_match( $illegalCharsRe, $title ) ) { throw new TitleException( "Invalid characters in title \"$origTitle\"", 'title-invalid-characters', $title ); } // Pages with "/./" or "/../" appearing in the URLs will often be // unreachable due to the way web browsers deal with 'relative' URLs. // Also, they conflict with subpage syntax. Forbid them explicitly. if ( str_contains( $title, '.' ) && ( $title === '.' || $title === '..' || str_starts_with( $title, './' ) || str_starts_with( $title, '../' ) || str_contains( $title, '/./' ) || str_contains( $title, '/../' ) || str_ends_with( $title, '/.' ) || str_ends_with( $title, '/..' ) ) ) { throw new TitleException( "Title \"$origTitle\" contains relative path components", 'title-invalid-relative', $title ); } // Magic tilde sequences? Nu-uh! if ( str_contains( $title, '~~~' ) ) { throw new TitleException( "Title \"$origTitle\" contains ~~~", 'title-invalid-magic-tilde', $title ); } $maxLength = $ns === $siteConfig->canonicalNamespaceId( 'special' ) ? 512 : 255; if ( strlen( $title ) > $maxLength ) { throw new TitleException( "Title \"$origTitle\" is too long", 'title-invalid-too-long', $title ); } if ( $interwiki === null && $siteConfig->namespaceCase( $ns ) === 'first-letter' ) { $title = $siteConfig->ucfirst( $title ); } # Can't make a link to a namespace alone... "empty" local links can only be # self-links with a fragment identifier. if ( $title === '' && $interwiki === null && $ns !== $siteConfig->canonicalNamespaceId( '' ) ) { throw new TitleException( 'Empty title', 'title-invalid-empty', $title ); } // This is from MediaWikiTitleCodec::splitTitleString() in core if ( $title !== '' && ( # T329690 $ns === $siteConfig->canonicalNamespaceId( 'user' ) || $ns === $siteConfig->canonicalNamespaceId( 'user_talk' ) ) ) { $title = IPUtils::sanitizeIP( $title ); } // Any remaining initial :s are illegal. 
if ( $title !== '' && $title[0] == ':' ) { throw new TitleException( 'Leading colon title', 'title-invalid-leading-colon', $title ); } // This is not in core's splitTitleString but matches // mediawiki-title's newFromText. if ( $ns === $siteConfig->canonicalNamespaceId( 'special' ) ) { $title = self::fixSpecialName( $siteConfig, $title ); } $namespaceName = $siteConfig->namespaceName( $ns ); return new self( $interwiki ?? '', $title, $ns, $namespaceName, $fragment ); } /** * The interwiki component of this LinkTarget. * This is the empty string if there is no interwiki component. * * @return string */ public function getInterwiki(): string { return $this->interwiki; } /** * Get the DBkey, prefixed with interwiki prefix if any. * This is Parsoid's convention, which differs from core; * use ::getDBkey() for a method compatible with core's * convention. * * @return string * @see ::getDBkey() * @deprecated */ public function getKey(): string { if ( $this->interwiki ) { return $this->interwiki . ':' . $this->dbkey; } return $this->dbkey; } /** * Get the main part of the link target, in canonical database form. * * The main part is the link target without namespace prefix or hash fragment. * The database form means that spaces become underscores, this is also * used for URLs. * * @return string */ public function getDBkey(): string { return $this->dbkey; } /** * Get the prefixed DBkey * @return string */ public function getPrefixedDBKey(): string { if ( $this->prefixedDBKey === null ) { $this->prefixedDBKey = $this->interwiki === '' ? '' : ( $this->interwiki . ':' ); $this->prefixedDBKey .= $this->namespaceName === '' ? '' : ( strtr( $this->namespaceName, ' ', '_' ) . ':' ); $this->prefixedDBKey .= $this->getDBkey(); } return $this->prefixedDBKey; } /** * Get the prefixed text * @return string */ public function getPrefixedText(): string { if ( $this->prefixedText === null ) { $this->prefixedText = $this->interwiki === '' ? '' : ( $this->interwiki . 
':' ); $this->prefixedText .= $this->namespaceName === '' ? '' : ( $this->namespaceName . ':' ); $this->prefixedText .= $this->getText(); } return $this->prefixedText; } /** * Get the prefixed title with spaces, plus any fragment * (part beginning with '#') * * @return string The prefixed title, with spaces and the fragment, including '#' */ public function getFullText(): string { $text = $this->getPrefixedText(); if ( $this->hasFragment() ) { $text .= '#' . $this->getFragment(); } return $text; } /** * Get the namespace ID * @return int */ public function getNamespace(): int { return $this->namespaceId; } /** * Get the human-readable name for the namespace * (with spaces, not underscores). * @return string */ public function getNamespaceName(): string { return $this->namespaceName; } /** * Get the link fragment in text form (i.e. the bit after the hash `#`). * * @return string link fragment */ public function getFragment(): string { return $this->fragment ?? ''; } /** * Compare with another title. * * @param Title $title * @return bool */ public function equals( Title $title ) { return $this->getNamespace() === $title->getNamespace() && $this->getInterwiki() === $title->getInterwiki() && $this->getDBkey() === $title->getDBkey(); } /** * Returns true if this is a special page. * * @return bool */ public function isSpecialPage() { return $this->getNamespace() === -1; // NS_SPECIAL; } /** * Use the default special page alias. * * @param SiteConfig $siteConfig * @param string $title * @return string */ public static function fixSpecialName( SiteConfig $siteConfig, string $title ): string { $parts = explode( '/', $title, 2 ); $specialName = $siteConfig->specialPageLocalName( $parts[0] ); if ( $specialName !== null ) { $parts[0] = $specialName; $title = implode( '/', $parts ); } return $title; } /** * Create a new LinkTarget with a different fragment on the same page. 
* * It is expected that the same type of object will be returned, but the * only requirement is that it is a LinkTarget. * * @param string $fragment The fragment override, or "" to remove it. * * @return self */ public function createFragmentTarget( string $fragment ) { return new self( $this->interwiki, $this->dbkey, $this->namespaceId, $this->namespaceName, $fragment ?: null ); } /** * Convert LinkTarget from core (or other implementation) into a * Parsoid Title. * * @param LinkTarget $linkTarget * @return self */ public static function newFromLinkTarget( LinkTarget $linkTarget, SiteConfig $siteConfig ) { if ( $linkTarget instanceof Title ) { return $linkTarget; } $ns = $linkTarget->getNamespace(); $namespaceName = $siteConfig->namespaceName( $ns ); Assert::invariant( $namespaceName !== null, "Badtitle ({$linkTarget}) in unknown namespace ({$ns})" ); return new self( $linkTarget->getInterwiki(), $linkTarget->getDBkey(), $linkTarget->getNamespace(), $namespaceName, $linkTarget->getFragment() ); } } PK ! N��H� � DTState.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use stdClass; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Wt2Html\PegTokenizer; /** * State carried while DOM Traversing. * * FIXME: As it stands, DTState cannot be constructed outside of Parsoid. * However, extensions and core code might benefit from a non-Parsoid-specific * state object that DOMTraverser users outside of Parsoid could use. */ class DTState { public Env $env; public array $options; public bool $atTopLevel; public ?stdClass $tplInfo = null; public array $abouts = []; public array $seenIds = []; public array $usedIdIndex = []; public ?PegTokenizer $tokenizer = null; // Needed by TableFixups handlers public function __construct( Env $env, array $options = [], bool $atTopLevel = false ) { $this->env = $env; $this->options = $options; $this->atTopLevel = $atTopLevel; } } PK ! 
܍�.�k �k DOMUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Assert\Assert; use Wikimedia\Parsoid\Core\ClientError; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\Wikitext\Consts; use Wikimedia\Parsoid\Wt2Html\XMLSerializer; use Wikimedia\RemexHtml\DOM\DOMBuilder; use Wikimedia\RemexHtml\Tokenizer\Tokenizer; use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; /** * DOM utilities for querying the DOM. This is largely independent of Parsoid * although some Parsoid details (TokenUtils, inline content version) * have snuck in. */ class DOMUtils { /** * Parse HTML, return the tree. * * @param string $html * @param bool $validateXMLNames * @return Document */ public static function parseHTML( string $html, bool $validateXMLNames = false ): Document { if ( !preg_match( '/^<(?:!doctype|html|body)/i', $html ) ) { // Make sure that we parse fragments in the body. Otherwise comments, // link and meta tags end up outside the html element or in the head // elements. $html = '<body>' . $html; } $domBuilder = new class( [ 'suppressHtmlNamespace' => true, ] ) extends DOMBuilder { /** @inheritDoc */ protected function createDocument( ?string $doctypeName = null, ?string $public = null, ?string $system = null ) { // @phan-suppress-next-line PhanTypeMismatchReturn return DOMCompat::newDocument( false ); } }; $treeBuilder = new TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] ); $dispatcher = new Dispatcher( $treeBuilder ); $tokenizer = new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] ); $tokenizer->execute( [] ); if ( $validateXMLNames && $domBuilder->isCoerced() ) { throw new ClientError( 'Encountered a name invalid in XML.' 
); } $frag = $domBuilder->getFragment(); '@phan-var Document $frag'; // @var Document $frag return $frag; } /** * This is a simplified version of the DOMTraverser. * Consider using that before making this more complex. * * FIXME: Move to DOMTraverser OR create a new class? * @param Node $node * @param callable $handler * @param mixed ...$args */ public static function visitDOM( Node $node, callable $handler, ...$args ): void { $handler( $node, ...$args ); $node = $node->firstChild; while ( $node ) { $next = $node->nextSibling; self::visitDOM( $node, $handler, ...$args ); $node = $next; } } /** * Move 'from'.childNodes to 'to' adding them before 'beforeNode' * If 'beforeNode' is null, the nodes are appended at the end. * @param Node $from Source node. Children will be removed. * @param Node $to Destination node. Children of $from will be added here * @param ?Node $beforeNode Add the children before this node. */ public static function migrateChildren( Node $from, Node $to, ?Node $beforeNode = null ): void { while ( $from->firstChild ) { $to->insertBefore( $from->firstChild, $beforeNode ); } } /** * Copy 'from'.childNodes to 'to' adding them before 'beforeNode' * 'from' and 'to' belong to different documents. * * If 'beforeNode' is null, the nodes are appended at the end. * @param Node $from * @param Node $to * @param ?Node $beforeNode */ public static function migrateChildrenBetweenDocs( Node $from, Node $to, ?Node $beforeNode = null ): void { $n = $from->firstChild; $destDoc = $to->ownerDocument; while ( $n ) { $to->insertBefore( $destDoc->importNode( $n, true ), $beforeNode ); $n = $n->nextSibling; } } // phpcs doesn't like @phan-assert... // phpcs:disable MediaWiki.Commenting.FunctionAnnotations.UnrecognizedAnnotation /** * Assert that this is a DOM element node. * This is primarily to help phan analyze variable types. 
* @phan-assert Element $node * @param ?Node $node * @return bool Always returns true * @phan-assert Element $node */ public static function assertElt( ?Node $node ): bool { Assert::invariant( $node instanceof Element, "Expected an element" ); return true; } /** * @param ?Node $node * @return bool */ public static function isRemexBlockNode( ?Node $node ): bool { return $node instanceof Element && !isset( Consts::$HTML['OnlyInlineElements'][DOMCompat::nodeName( $node )] ) && // This is a superset of \\MediaWiki\Tidy\RemexCompatMunger::$metadataElements !self::isMetaDataTag( $node ); } /** * @param ?Node $node * @return bool */ public static function isWikitextBlockNode( ?Node $node ): bool { return $node && TokenUtils::isWikitextBlockTag( DOMCompat::nodeName( $node ) ); } /** * Determine whether this is a formatting DOM element. * @param ?Node $node * @return bool */ public static function isFormattingElt( ?Node $node ): bool { return $node && isset( Consts::$HTML['FormattingTags'][DOMCompat::nodeName( $node )] ); } /** * Determine whether this is a quote DOM element. * @param ?Node $node * @return bool */ public static function isQuoteElt( ?Node $node ): bool { return $node && isset( Consts::$WTQuoteTags[DOMCompat::nodeName( $node )] ); } /** * Determine whether this is the <body> DOM element. * @param ?Node $node * @return bool */ public static function isBody( ?Node $node ): bool { return $node && DOMCompat::nodeName( $node ) === 'body'; } /** * Determine whether this is a removed DOM node but Node object yet * @param ?Node $node * @return bool */ public static function isRemoved( ?Node $node ): bool { return !$node || !isset( $node->nodeType ); } /** * Build path from a node to the root of the document. 
* * @param Node $node * @return Node[] Path including all nodes from $node to the root of the document */ public static function pathToRoot( Node $node ): array { $path = []; do { $path[] = $node; // phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition } while ( $node = $node->parentNode ); return $path; } /** * Compute the edge length of the path from $node to the root. * Root document is at depth 0, <html> at 1, <body> at 2. * @param Node $node * @return int */ public static function nodeDepth( Node $node ): int { $edges = 0; // phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition while ( $node = $node->parentNode ) { $edges++; } return $edges; } /** * Build path from a node to its passed-in sibling. * Return will not include the passed-in sibling. * * @param Node $node * @param Node $sibling * @param bool $left indicates whether to go backwards, use previousSibling instead of nextSibling. * @return Node[] */ public static function pathToSibling( Node $node, Node $sibling, bool $left ): array { $path = []; while ( $node && $node !== $sibling ) { $path[] = $node; $node = $left ? $node->previousSibling : $node->nextSibling; } return $path; } /** * Check whether a node `n1` comes before another node `n2` in * their parent's children list. * * @param Node $n1 The node you expect to come first. * @param Node $n2 Expected later sibling. * @return bool */ public static function inSiblingOrder( Node $n1, Node $n2 ): bool { while ( $n1 && $n1 !== $n2 ) { $n1 = $n1->nextSibling; } return $n1 !== null; } /** * Check that a node 'n1' is an ancestor of another node 'n2' in * the DOM. Returns true if n1 === n2. * $n1 is the suspected ancestor. * $n2 The suspected descendant. * * @param Node $n1 * @param Node $n2 * @return bool */ public static function isAncestorOf( Node $n1, Node $n2 ): bool { while ( $n2 && $n2 !== $n1 ) { $n2 = $n2->parentNode; } return $n2 !== null; } /** * Find an ancestor of $node with nodeName $name. 
*
     * @param Node $node
     * @param string $name
     * @return ?Element
     */
    public static function findAncestorOfName( Node $node, string $name ): ?Element {
        // Start at the parent: $node itself is never a candidate.
        for ( $node = $node->parentNode; $node !== null; $node = $node->parentNode ) {
            if ( DOMCompat::nodeName( $node ) === $name ) {
                break;
            }
        }
        '@phan-var Element $node'; // @var Element $node
        return $node;
    }

    /**
     * Check whether $node has $name or has an ancestor named $name.
     *
     * @param Node $node
     * @param string $name
     * @return bool
     */
    public static function hasNameOrHasAncestorOfName( Node $node, string $name ): bool {
        if ( DOMCompat::nodeName( $node ) === $name ) {
            return true;
        }
        return self::findAncestorOfName( $node, $name ) !== null;
    }

    /**
     * Match a node against an expected nodeName and a regular expression
     * for its `typeof` attribute.
     *
     * @param Node $n The node to test
     * @param string $name The expected nodeName of $n
     * @param string $typeRe Regular expression matching the expected value of
     *   `typeof` attribute.
     * @return ?string The matching `typeof` value, or `null` if there is
     *   no match (including when the node name differs).
     */
    public static function matchNameAndTypeOf( Node $n, string $name, string $typeRe ): ?string {
        if ( DOMCompat::nodeName( $n ) !== $name ) {
            return null;
        }
        return self::matchTypeOf( $n, $typeRe );
    }

    /**
     * Determine whether the node matches the given nodeName and typeof
     * attribute value; the typeof is given as string.
     *
     * @param Node $n
     * @param string $name node name to test for
     * @param string $type Expected value of "typeof" attribute (literal string)
     * @return bool True if the node matches.
     */
    public static function hasNameAndTypeOf( Node $n, string $name, string $type ): bool {
        // Anchor the quoted literal so only a whole typeof token can match.
        $exactTypeRe = '/^' . preg_quote( $type, '/' ) . '$/';
        return self::matchNameAndTypeOf( $n, $name, $exactTypeRe ) !== null;
    }

    /**
     * Determine whether the node matches the given `typeof` attribute value.
     *
     * @param Node $n The node to test
     * @param string $typeRe Regular expression matching the expected value of
     *   the `typeof` attribute.
* @return ?string The matching `typeof` value, or `null` if there is
     *   no match.
     */
    public static function matchTypeOf( Node $n, string $typeRe ): ?string {
        return self::matchMultivalAttr( $n, 'typeof', $typeRe );
    }

    /**
     * Determine whether the node matches the given `rel` attribute value.
     *
     * @param Node $n The node to test
     * @param string $relRe Regular expression matching the expected value of
     *   the `rel` attribute.
     * @return ?string The matching `rel` value, or `null` if there is
     *   no match.
     */
    public static function matchRel( Node $n, string $relRe ): ?string {
        return self::matchMultivalAttr( $n, 'rel', $relRe );
    }

    /**
     * Return the first space-separated token of an attribute that matches
     * a regular expression.
     *
     * @param Node $n The node to test
     * @param string $attrName the attribute to test (typically 'rel' or 'typeof')
     * @param string $valueRe Regular expression matching the expected value of
     *   the attribute.
     * @return ?string The matching attribute value, or `null` if there is
     *   no match (non-Element node, missing/empty attribute, or no token matches).
     */
    private static function matchMultivalAttr( Node $n, string $attrName, string $valueRe ): ?string {
        if ( !( $n instanceof Element ) ) {
            return null;
        }
        $raw = DOMCompat::getAttribute( $n, $attrName );
        if ( ( $raw ?? '' ) === '' ) {
            return null;
        }
        foreach ( explode( ' ', $raw ) as $candidate ) {
            // Consecutive spaces yield empty tokens; those are skipped.
            if ( $candidate !== '' ) {
                $matched = preg_match( $valueRe, $candidate );
                Assert::invariant( $matched !== false, "Bad regexp" );
                if ( $matched ) {
                    return $candidate;
                }
            }
        }
        return null;
    }

    /**
     * Determine whether the node matches the given typeof attribute value.
     *
     * @param Node $n
     * @param string $type Expected value of "typeof" attribute, as a literal
     *   string.
     * @return bool True if the node matches.
     */
    public static function hasTypeOf( Node $n, string $type ): bool {
        return self::hasValueInMultivalAttr( $n, 'typeof', $type );
    }

    /**
     * Determine whether the node matches the given rel attribute value.
*
     * @param Node $n
     * @param string $rel Expected value of "rel" attribute, as a literal string.
     * @return bool True if the node matches.
     */
    public static function hasRel( Node $n, string $rel ): bool {
        return self::hasValueInMultivalAttr( $n, 'rel', $rel );
    }

    /**
     * Test the element's `class` attribute for a whitespace-delimited token
     * matching $regex. A missing attribute is treated as an empty string.
     *
     * @param Element $element
     * @param string $regex Partial regular expression, e.g. "foo|bar"
     * @return bool
     */
    public static function hasClass( Element $element, string $regex ): bool {
        $classAttr = DOMCompat::getAttribute( $element, 'class' ) ?? '';
        $pattern = '{(?<=^|\s)' . $regex . '(?=\s|$)}';
        return (bool)preg_match( $pattern, $classAttr );
    }

    /**
     * Determine whether the node matches the given attribute value for a multivalued attribute
     *
     * @param Node $n
     * @param string $attrName name of the attribute to check (typically 'typeof', 'rel')
     * @param string $value Expected value of the $attrName attribute, as a literal string.
     * @return bool True if the node matches
     */
    private static function hasValueInMultivalAttr( Node $n, string $attrName, string $value ): bool {
        if ( !( $n instanceof Element ) ) {
            return false;
        }
        $current = DOMCompat::getAttribute( $n, $attrName ) ?? '';
        if ( $current === '' ) {
            return false;
        }
        // Fast path: the whole attribute is the value; otherwise split on
        // spaces and look for an exact token.
        return $current === $value ||
            in_array( $value, explode( ' ', $current ), true );
    }

    /**
     * Add a type to the typeof attribute. This method should almost always
     * be used instead of `setAttribute`, to ensure we don't overwrite existing
     * typeof information.
     *
     * @param Element $node node
     * @param string $type type
     * @param bool $prepend If true, adds value to start, rather than end.
     *   Use of this option in new code is discouraged.
     */
    public static function addTypeOf( Element $node, string $type, bool $prepend = false ): void {
        self::addValueToMultivalAttr( $node, 'typeof', $type, $prepend );
    }

    /**
     * Add a type to the rel attribute. This method should almost always
     * be used instead of `setAttribute`, to ensure we don't overwrite existing
     * rel information.
*
     * @param Element $node node
     * @param string $rel type
     */
    public static function addRel( Element $node, string $rel ): void {
        self::addValueToMultivalAttr( $node, 'rel', $rel );
    }

    /**
     * Add an element to a multivalue attribute (typeof, rel). This method should almost always
     * be used instead of `setAttribute`, to ensure we don't overwrite existing
     * multivalue information.
     *
     * Nothing is written when $value is blank or already present as a token.
     *
     * @param Element $node
     * @param string $attr
     * @param string $value
     * @param bool $prepend If true, adds value to start, rather than end
     */
    private static function addValueToMultivalAttr(
        Element $node, string $attr, string $value, bool $prepend = false
    ): void {
        $value = trim( $value );
        if ( $value === '' ) {
            return;
        }
        $oldValue = DOMCompat::getAttribute( $node, $attr );
        $existing = trim( $oldValue ?? '' );
        if ( $existing !== '' ) {
            if ( in_array( $value, explode( ' ', $existing ), true ) ) {
                return;
            }
            // The untrimmed old value is what gets concatenated.
            $value = $prepend ? "$value $oldValue" : "$oldValue $value";
        }
        $node->setAttribute( $attr, $value );
    }

    /**
     * Remove a value from a multiple-valued attribute. The attribute itself
     * is removed when no token is left.
     *
     * @param Element $node node
     * @param string $attr The attribute name
     * @param string $value The value to remove
     */
    private static function removeValueFromMultivalAttr(
        Element $node, string $attr, string $value
    ): void {
        $oldValue = DOMCompat::getAttribute( $node, $attr );
        if ( $oldValue === null || $oldValue === '' ) {
            return;
        }
        $remaining = array_diff( explode( ' ', $oldValue ), [ trim( $value ) ] );
        if ( count( $remaining ) === 0 ) {
            $node->removeAttribute( $attr );
        } else {
            $node->setAttribute( $attr, implode( ' ', $remaining ) );
        }
    }

    /**
     * Remove a type from the typeof attribute.
     *
     * @param Element $node node
     * @param string $type type
     */
    public static function removeTypeOf( Element $node, string $type ): void {
        self::removeValueFromMultivalAttr( $node, 'typeof', $type );
    }

    /**
     * Remove a type from the rel attribute.
*
     * @param Element $node node
     * @param string $rel rel
     */
    public static function removeRel( Element $node, string $rel ): void {
        self::removeValueFromMultivalAttr( $node, 'rel', $rel );
    }

    /**
     * Check whether `node` is in a fosterable position, i.e. the name of its
     * parent is a key of Consts::$HTML['FosterablePosition'].
     *
     * @param ?Node $n
     * @return bool False for null and for nodes without a parent.
     */
    public static function isFosterablePosition( ?Node $n ): bool {
        // A node without a parent has a null parentNode; guard here instead
        // of handing null to DOMCompat::nodeName().
        $parent = $n ? $n->parentNode : null;
        return $parent !== null &&
            isset( Consts::$HTML['FosterablePosition'][DOMCompat::nodeName( $parent )] );
    }

    /**
     * Check whether `node` is a heading (h1 through h6).
     *
     * @param ?Node $n
     * @return bool
     */
    public static function isHeading( ?Node $n ): bool {
        return $n && preg_match( '/^h[1-6]$/D', DOMCompat::nodeName( $n ) );
    }

    /**
     * Check whether `node` is a list.
     *
     * @param ?Node $n
     * @return bool
     */
    public static function isList( ?Node $n ): bool {
        return $n && isset( Consts::$HTML['ListTags'][DOMCompat::nodeName( $n )] );
    }

    /**
     * Check whether `node` is a list item.
     *
     * @param ?Node $n
     * @return bool
     */
    public static function isListItem( ?Node $n ): bool {
        return $n && isset( Consts::$HTML['ListItemTags'][DOMCompat::nodeName( $n )] );
    }

    /**
     * Check whether `node` is a list or list item.
     *
     * @param ?Node $n
     * @return bool
     */
    public static function isListOrListItem( ?Node $n ): bool {
        return self::isList( $n ) || self::isListItem( $n );
    }

    /**
     * Check whether `node` is nested in a list item, i.e. one of its
     * ancestors is a list item.
     *
     * @param ?Node $n
     * @return bool False for null, matching the nullable signature.
     */
    public static function isNestedInListItem( ?Node $n ): bool {
        // The signature admits null, so do not dereference $n unconditionally.
        $parentNode = $n ? $n->parentNode : null;
        while ( $parentNode ) {
            if ( self::isListItem( $parentNode ) ) {
                return true;
            }
            $parentNode = $parentNode->parentNode;
        }
        return false;
    }

    /**
     * Check whether `node` is a nested list or a list item.
     *
     * @param ?Node $n
     * @return bool
     */
    public static function isNestedListOrListItem( ?Node $n ): bool {
        return self::isListOrListItem( $n ) && self::isNestedInListItem( $n );
    }

    /**
     * Check a node to see whether it's a meta with some typeof.
*
     * @param Node $n
     * @param string $type
     * @return bool
     */
    public static function isMarkerMeta( Node $n, string $type ): bool {
        return self::hasNameAndTypeOf( $n, 'meta', $type );
    }

    /**
     * Check whether a node has any children that are elements.
     *
     * @param Node $node
     * @return bool
     */
    public static function hasElementChild( Node $node ): bool {
        $child = $node->firstChild;
        while ( $child ) {
            if ( $child instanceof Element ) {
                return true;
            }
            $child = $child->nextSibling;
        }
        return false;
    }

    /**
     * Check if a node has a block-level element descendant.
     *
     * @param Node $node
     * @return bool
     */
    public static function hasBlockElementDescendant( Node $node ): bool {
        for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
            if ( !( $child instanceof Element ) ) {
                continue;
            }
            // Is a block-level node
            if ( self::isWikitextBlockNode( $child ) ) {
                return true;
            }
            // or has a block-level child or grandchild or..
            if ( self::hasBlockElementDescendant( $child ) ) {
                return true;
            }
        }
        return false;
    }

    /**
     * Is a node representing inter-element whitespace, i.e. a Text node
     * whose value is empty or whitespace-only?
     *
     * @param ?Node $node
     * @return bool
     */
    public static function isIEW( ?Node $node ): bool {
        if ( !( $node instanceof Text ) ) {
            return false;
        }
        // ws-only
        return (bool)preg_match( '/^\s*$/D', $node->nodeValue );
    }

    /**
     * Is a node a document fragment?
     *
     * @param ?Node $node
     * @return bool
     */
    public static function isDocumentFragment( ?Node $node ): bool {
        if ( $node === null ) {
            return false;
        }
        return $node->nodeType === XML_DOCUMENT_FRAG_NODE;
    }

    /**
     * Is a node at the top, i.e. a document fragment or the <body>?
     *
     * @param ?Node $node
     * @return bool
     */
    public static function atTheTop( ?Node $node ): bool {
        if ( self::isDocumentFragment( $node ) ) {
            return true;
        }
        return self::isBody( $node );
    }

    /**
     * Are all children of this node text or comment nodes?
*
     * @param Node $node
     * @return bool True also when $node has no children at all.
     */
    public static function allChildrenAreTextOrComments( Node $node ): bool {
        for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
            $isTextOrComment = $child instanceof Text || $child instanceof Comment;
            if ( !$isTextOrComment ) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if the dom-subtree rooted at node has an element with tag name 'tagName'
     * By default, the root node is not checked.
     *
     * @param Node $node The DOM node whose tree should be checked
     * @param string $tagName Tag name to look for
     * @param bool $checkRoot Should the root be checked?
     * @return bool
     */
    public static function treeHasElement( Node $node, string $tagName, bool $checkRoot = false ): bool {
        if ( $checkRoot && DOMCompat::nodeName( $node ) === $tagName ) {
            return true;
        }
        for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
            // Element children are checked themselves, hence $checkRoot = true.
            if ( $child instanceof Element && self::treeHasElement( $child, $tagName, true ) ) {
                return true;
            }
        }
        return false;
    }

    /**
     * Is node a table tag (table, tbody, td, tr, etc.)?
     *
     * @param Node $node
     * @return bool
     */
    public static function isTableTag( Node $node ): bool {
        $name = DOMCompat::nodeName( $node );
        return isset( Consts::$HTML['TableTags'][$name] );
    }

    /**
     * Returns a media element (img, video or audio) nested in `node`
     *
     * @param Element $node
     * @return Element|null
     */
    public static function selectMediaElt( Element $node ): ?Element {
        return DOMCompat::querySelector( $node, 'img, video, audio' );
    }

    /**
     * Extract http-equiv headers from the HTML, including content-language and
     * vary headers, if present. Header names are lower-cased in the result.
     *
     * @param Document $doc
     * @return array<string,string>
     */
    public static function findHttpEquivHeaders( Document $doc ): array {
        $headers = [];
        foreach ( DOMCompat::querySelectorAll( $doc, 'meta[http-equiv][content]' ) as $meta ) {
            $name = strtolower( DOMCompat::getAttribute( $meta, 'http-equiv' ) );
            $headers[$name] = DOMCompat::getAttribute( $meta, 'content' );
        }
        return $headers;
    }

    /**
     * Add or replace http-equiv headers in the HTML <head>.
* This is used for content-language and vary headers, among possible
     * others.
     * @param Document $doc The HTML document to update
     * @param array<string,string|string[]> $headers An array mapping HTTP
     *   header names (which are case-insensitive) to new values. If an
     *   array of values is provided, they will be joined with commas.
     */
    public static function addHttpEquivHeaders( Document $doc, array $headers ): void {
        foreach ( $headers as $name => $content ) {
            if ( is_array( $content ) ) {
                $content = implode( ',', $content );
            }
            // HTTP header names are case-insensitive; hence the "i" suffix
            // on this selector query.
            $selector = "meta[http-equiv=\"{$name}\"i]";
            $meta = DOMCompat::querySelector( $doc, $selector );
            if ( !$meta ) {
                // This also ensures there is a <head> element.
                $meta = self::appendToHead( $doc, 'meta', [ 'http-equiv' => $name ] );
            }
            $meta->setAttribute( 'content', $content );
        }
    }

    /**
     * Read the `content` attribute of the first <meta> whose `property` is
     * "mw:htmlVersion" or "mw:html:version".
     *
     * @param Document $doc
     * @return string|null Null when no such <meta> exists.
     */
    public static function extractInlinedContentVersion( Document $doc ): ?string {
        $selector = 'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]';
        $meta = DOMCompat::querySelector( $doc, $selector );
        if ( !$meta ) {
            return null;
        }
        return DOMCompat::getAttribute( $meta, 'content' );
    }

    /**
     * Add attributes to a node element. Null values are skipped; `id` is
     * set through DOMCompat::setIdAttribute().
     *
     * @param Element $elt element
     * @param array $attrs attributes
     */
    public static function addAttributes( Element $elt, array $attrs ): void {
        foreach ( $attrs as $name => $val ) {
            if ( $val === null ) {
                continue;
            }
            if ( $name === 'id' ) {
                DOMCompat::setIdAttribute( $elt, $val );
            } else {
                $elt->setAttribute( $name, $val );
            }
        }
    }

    /**
     * Create an element in the document head with the given attrs.
     * Creates the head element in the document if needed.
*
     * @param Document $document
     * @param string $tagName
     * @param array $attrs
     * @return Element The newly-appended Element
     */
    public static function appendToHead( Document $document, string $tagName, array $attrs = [] ): Element {
        $newElt = $document->createElement( $tagName );
        self::addAttributes( $newElt, $attrs );

        $head = DOMCompat::getHead( $document );
        if ( !$head ) {
            // No <head> yet: create one and place it in front of the body.
            $head = $document->createElement( 'head' );
            $document->documentElement->insertBefore(
                $head, DOMCompat::getBody( $document )
            );
        }

        $head->appendChild( $newElt );
        return $newElt;
    }

    /**
     * innerHTML and outerHTML are not defined on DocumentFragment.
     *
     * Defined similarly to DOMCompat::getInnerHTML()
     *
     * @param DocumentFragment $frag
     * @return string
     */
    public static function getFragmentInnerHTML( DocumentFragment $frag ): string {
        $serialized = XMLSerializer::serialize( $frag, [ 'innerXML' => true ] );
        return $serialized['html'];
    }

    /**
     * innerHTML and outerHTML are not defined on DocumentFragment.
     * @see DOMCompat::setInnerHTML() for the Element version
     *
     * @param DocumentFragment $frag
     * @param string $html
     */
    public static function setFragmentInnerHTML( DocumentFragment $frag, string $html ): void {
        // FIXME: This should be an HTML5 template element
        $scratchBody = $frag->ownerDocument->createElement( 'body' );
        DOMCompat::setInnerHTML( $scratchBody, $html );
        self::migrateChildren( $scratchBody, $frag );
    }

    /**
     * Create a new fragment in $doc and fill it from an HTML string.
     *
     * @param Document $doc
     * @param string $html
     * @return DocumentFragment
     */
    public static function parseHTMLToFragment( Document $doc, string $html ): DocumentFragment {
        $fragment = $doc->createDocumentFragment();
        self::setFragmentInnerHTML( $fragment, $html );
        return $fragment;
    }

    /**
     * Is the node's name a key of Consts::$HTML['RawTextElements']?
     *
     * @param Node $node
     * @return bool
     */
    public static function isRawTextElement( Node $node ): bool {
        $name = DOMCompat::nodeName( $node );
        return isset( Consts::$HTML['RawTextElements'][$name] );
    }

    /**
     * Is 'n' a block tag, or does the subtree rooted at 'n' have a block tag
     * in it?
* * @param Node $n * @return bool */ public static function hasBlockTag( Node $n ): bool { if ( self::isRemexBlockNode( $n ) ) { return true; } $c = $n->firstChild; while ( $c ) { if ( self::hasBlockTag( $c ) ) { return true; } $c = $c->nextSibling; } return false; } /** * Get an associative array of attributes, suitable for serialization. * * Add the xmlns attribute if available, to workaround PHP's surprising * behavior with the xmlns attribute: HTML is *not* an XML document, * but various parts of PHP (including our misnamed XMLSerializer) pretend * that it is, sort of. * * @param Element $element * @return array<string,string> * @see https://phabricator.wikimedia.org/T235295 */ public static function attributes( Element $element ): array { $result = []; // The 'xmlns' attribute is "invisible" T235295 $xmlns = DOMCompat::getAttribute( $element, 'xmlns' ); if ( $xmlns !== null ) { $result['xmlns'] = $xmlns; } foreach ( $element->attributes as $attr ) { $result[$attr->name] = $attr->value; } return $result; } /** * @param Element $node * @return bool */ public static function isMetaDataTag( Element $node ): bool { return isset( Consts::$HTML['MetaDataTags'][DOMCompat::nodeName( $node )] ); } /** * Strip a paragraph wrapper, if any, before parsing HTML to DOM */ public static function stripPWrapper( string $ret ): string { return preg_replace( '#(^<p>)|(\n</p>(' . Utils::COMMENT_REGEXP_FRAGMENT . '|\s)*$)#D', '', $ret ); } } PK ! g���� � DiffDOMUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\Html2Wt\DiffUtils; /** * Some diff marker aware DOM utils. */ class DiffDOMUtils { /** * Test the number of children this node has without using * `DOMNode::$childNodes->count()`. 
This walks the sibling list and so
     * takes O(`nchildren`) time -- so `nchildren` is expected to be small
     * (say: 0, 1, or 2).
     *
     * Skips all diff markers by default.
     * @param Node $node
     * @param int $nchildren
     * @param bool $countDiffMarkers
     * @return bool
     */
    public static function hasNChildren(
        Node $node, int $nchildren, bool $countDiffMarkers = false
    ): bool {
        $remaining = $nchildren;
        for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
            $counted = $countDiffMarkers || !DiffUtils::isDiffMarker( $child );
            if ( !$counted ) {
                continue;
            }
            // One child more than expected: stop walking right away.
            if ( $remaining <= 0 ) {
                return false;
            }
            $remaining--;
        }
        return $remaining === 0;
    }

    /**
     * Is a node a content node, i.e. not a comment, not inter-element
     * whitespace and not a diff marker?
     *
     * @param ?Node $node
     * @return bool
     */
    public static function isContentNode( ?Node $node ): bool {
        return !(
            $node instanceof Comment ||
            DOMUtils::isIEW( $node ) ||
            DiffUtils::isDiffMarker( $node )
        );
    }

    /**
     * Get the first child element or non-IEW text node, ignoring
     * whitespace-only text nodes, comments, and deleted nodes.
     *
     * @param Node $node
     * @return Node|null
     */
    public static function firstNonSepChild( Node $node ): ?Node {
        for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
            if ( self::isContentNode( $child ) ) {
                return $child;
            }
        }
        return null;
    }

    /**
     * Get the last child element or non-IEW text node, ignoring
     * whitespace-only text nodes, comments, and deleted nodes.
     *
     * @param Node $node
     * @return Node|null
     */
    public static function lastNonSepChild( Node $node ): ?Node {
        for ( $child = $node->lastChild; $child !== null; $child = $child->previousSibling ) {
            if ( self::isContentNode( $child ) ) {
                return $child;
            }
        }
        return null;
    }

    /**
     * Get the previous non separator sibling node.
     *
     * @param Node $node
     * @return Node|null
     */
    public static function previousNonSepSibling( Node $node ): ?Node {
        for ( $prev = $node->previousSibling; $prev !== null; $prev = $prev->previousSibling ) {
            if ( self::isContentNode( $prev ) ) {
                return $prev;
            }
        }
        return null;
    }

    /**
     * Get the next non separator sibling node.
*
     * @param Node $node
     * @return Node|null
     */
    public static function nextNonSepSibling( Node $node ): ?Node {
        for ( $next = $node->nextSibling; $next !== null; $next = $next->nextSibling ) {
            if ( self::isContentNode( $next ) ) {
                return $next;
            }
        }
        return null;
    }

    /**
     * Return the number of non deleted child nodes.
     *
     * @param Node $node
     * @return int
     */
    public static function numNonDeletedChildNodes( Node $node ): int {
        $count = 0;
        for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
            // FIXME: This is ignoring both inserted/deleted
            if ( !DiffUtils::isDiffMarker( $child ) ) {
                $count++;
            }
        }
        return $count;
    }

    /**
     * Get the first non-deleted child of node.
     *
     * @param Node $node
     * @return Node|null
     */
    public static function firstNonDeletedChild( Node $node ): ?Node {
        // FIXME: This is ignoring both inserted/deleted
        for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
            if ( !DiffUtils::isDiffMarker( $child ) ) {
                return $child;
            }
        }
        return null;
    }

    /**
     * Get the last non-deleted child of node.
     *
     * @param Node $node
     * @return Node|null
     */
    public static function lastNonDeletedChild( Node $node ): ?Node {
        // FIXME: This is ignoring both inserted/deleted
        for ( $child = $node->lastChild; $child !== null; $child = $child->previousSibling ) {
            if ( !DiffUtils::isDiffMarker( $child ) ) {
                return $child;
            }
        }
        return null;
    }

    /**
     * Get the next non deleted sibling.
     *
     * @param Node $node
     * @return Node|null
     */
    public static function nextNonDeletedSibling( Node $node ): ?Node {
        // FIXME: This is ignoring both inserted/deleted
        for ( $sibling = $node->nextSibling; $sibling !== null; $sibling = $sibling->nextSibling ) {
            if ( !DiffUtils::isDiffMarker( $sibling ) ) {
                return $sibling;
            }
        }
        return null;
    }

    /**
     * Get the previous non deleted sibling.
* * @param Node $node * @return Node|null */ public static function previousNonDeletedSibling( Node $node ): ?Node { $node = $node->previousSibling; while ( $node && DiffUtils::isDiffMarker( $node ) ) { // FIXME: This is ignoring both inserted/deleted $node = $node->previousSibling; } return $node; } /** * Does `node` contain nothing or just non-newline whitespace? * `strict` adds the condition that all whitespace is forbidden. * * @param Node $node * @param bool $strict * @return bool */ public static function nodeEssentiallyEmpty( Node $node, bool $strict = false ): bool { $n = $node->firstChild; while ( $n ) { if ( $n instanceof Element && !DiffUtils::isDiffMarker( $n ) ) { return false; } elseif ( $n instanceof Text && ( $strict || !preg_match( '/^[ \t]*$/D', $n->nodeValue ) ) ) { return false; } elseif ( $n instanceof Comment ) { return false; } $n = $n->nextSibling; } return true; } } PK ! %O�Aq q ConfigUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; /** * This refactors common code in Api and Mock based config computation */ class ConfigUtils { /** * Compute the interwiki map based on raw data (either manually * configured or obtaianed via an API) * * @param array $iwData * @return array */ public static function computeInterwikiMap( array $iwData ): array { $interwikiMap = []; $keys = [ 'prefix' => true, 'url' => true, 'protorel' => true, 'local' => true, 'localinterwiki' => true, 'language' => true, 'extralanglink' => true, 'linktext' => true, ]; $cb = static function ( $v ) { return $v !== false; }; foreach ( $iwData as $iwEntry ) { $iwEntry['language'] = isset( $iwEntry['language'] ); // Fix up broken interwiki hrefs that are missing a $1 placeholder // Just append the placeholder at the end. // This makes sure that the interwikiMatcher adds one match // group per URI, and that interwiki links work as expected. 
// Not sure why Phan thinks $iwEntry['url'] is a bool // @phan-suppress-next-line PhanTypeMismatchArgumentInternal if ( strpos( $iwEntry['url'], '$1' ) === false ) { $iwEntry['url'] .= '$1'; } $iwEntry = array_intersect_key( $iwEntry, $keys ); $interwikiMap[$iwEntry['prefix']] = array_filter( $iwEntry, $cb ); } return $interwikiMap; } } PK ! �S�Z8 8 Timing.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Assert\Assert; use Wikimedia\Parsoid\Config\SiteConfig; /** * A helper class to make it easier to compute timing metrics. */ class Timing { /** * This is typically a StatsdDataFactoryInterface, but really could be * anything which has a `timing()` method. Set it to `null` to disable * metrics. * * @var ?object */ private ?object $metrics; /** * @var float */ private float $startTime; /** * @var ?SiteConfig */ private ?SiteConfig $siteConfig; private ?float $elapsed; private function __construct( ?object $configOrMetrics, ?float $elapsed = null ) { if ( $configOrMetrics instanceof SiteConfig ) { $this->siteConfig = $configOrMetrics; $this->metrics = $configOrMetrics->metrics(); } else { $this->siteConfig = null; $this->metrics = $configOrMetrics; } $this->startTime = self::millis(); $this->elapsed = $elapsed; } /** * Return the current number of milliseconds since the epoch, as a float. */ public static function millis(): float { return 1000 * microtime( true ); } /** * End this timing measurement, reporting it under the given `name`. * @param ?string $statsdCompat * @param ?string $name * @param ?array $labels * @return float Number of milliseconds reported */ public function end( ?string $statsdCompat = null, ?string $name = null, ?array $labels = [] ): float { if ( $this->elapsed === null ) { $this->elapsed = self::millis() - $this->startTime; } if ( $this->metrics ) { Assert::invariant( $statsdCompat !== null, 'Recording metric without a key.' 
); $this->metrics->timing( $statsdCompat, $this->elapsed ); } if ( $this->siteConfig ) { // Note that observeTiming takes a value in *milliseconds* // despite the name of the metric ending in `_seconds` $this->siteConfig->observeTiming( $name, $this->elapsed, $labels ); } return $this->elapsed; } /** * Override elapsed time of a timing instance * @param SiteConfig $siteConfig * @param float $value Value to measure in the metrics (milliseconds if timing) * @return Timing */ public static function fakeTiming( SiteConfig $siteConfig, float $value ): Timing { return new Timing( $siteConfig, $value ); } /** * Start a timing measurement, logging it to the given `$metrics` object * (which just needs to have a `timing()` method). * @param ?object $configOrMetrics * @return Timing */ public static function start( ?object $configOrMetrics = null ): Timing { return new Timing( $configOrMetrics ); } } PK ! �G�ܳ0 �0 ScriptUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); /** * This file contains general utilities for scripts in * the bin/, tools/, tests/ directories. This file should * not contain any helpers that are needed by code in the * lib/ directory. */ namespace Wikimedia\Parsoid\Utils; class ScriptUtils { /** * Split a tracing / debugging flag string into individual flags * and return them as an associative array with flags as keys and true as value. * * @param string $origFlag The original flag string. * @return array */ private static function fetchFlagsMap( string $origFlag ): array { $objFlags = explode( ',', $origFlag ); if ( in_array( 'selser', $objFlags, true ) && !in_array( 'wts', $objFlags, true ) ) { $objFlags[] = 'wts'; } return array_fill_keys( $objFlags, true ); } /** * @return-taint none */ private static function getScriptName(): string { return basename( $_SERVER["SCRIPT_FILENAME"] ); } /** * Returns a help message for the tracing flags. 
*
     * @return string Newline-joined help text; the example lines embed the
     *   name of the running script.
     */
    public static function traceUsageHelp(): string {
        $script = self::getScriptName();
        return implode(
            "\n", [
                'Tracing',
                '-------',
                '- With one or more comma-separated flags, traces those specific phases',
                '- Supported flags:',
                ' * peg : shows tokens emitted by tokenizer',
                ' * ttm:2 : shows tokens flowing through stage 2 of the parsing pipeline',
                ' * ttm:3 : shows tokens flowing through stage 3 of the parsing pipeline',
                ' * tsp : shows tokens flowing through the TokenStreamPatcher ' .
                    '(useful to see in-order token stream)',
                ' * list : shows actions of the list handler',
                ' * sanitizer : shows actions of the sanitizer',
                ' * pre : shows actions of the pre handler',
                ' * p-wrap : shows actions of the paragraph wrapper',
                ' * html : shows tokens that are sent to the HTML tree builder',
                ' * remex : shows RemexHtml\'s tree mutation events',
                ' * dsr : shows dsr computation on the DOM',
                ' * tplwrap : traces template wrapping code (currently only range overlap/nest/merge code)',
                ' * wts : trace actions of the regular wikitext serializer',
                ' * selser : trace actions of the selective serializer',
                ' * domdiff : trace actions of the DOM diffing code',
                ' * wt-escape : debug wikitext-escaping',
                ' * apirequest: trace all API requests',
                ' * time : trace times for various phases',
                '',
                '--debug enables tracing of all the above phases except Token Transform Managers',
                '',
                'Examples:',
                "$ php $script --trace pre,p-wrap,html < foo",
                "$ php $script --trace ttm:3,dsr < foo",
                ''
            ]
        );
    }

    /**
     * Returns a help message for the dump flags.
     *
     * @return string Newline-joined help text; the example lines embed the
     *   name of the running script.
     */
    public static function dumpUsageHelp(): string {
        $script = self::getScriptName();
        return implode(
            "\n", [
                'Dumping state',
                '-------------',
                '- Dumps state at different points of execution',
                '- DOM dumps are always doc.outerHTML',
                '- Supported flags:',
                '',
                ' * tplsrc : dumps preprocessed template source that will be tokenized ' .
                    '(via ?action=expandtemplates)',
                ' * extoutput : dumps HTML output form extensions (via ?action=parse)',
                '',
                ' --- Dump flags for wt2html DOM passes ---',
                ' * dom:pre-XXX : dumps DOM before pass XXX runs',
                ' * dom:pre-* : dumps DOM before every pass',
                ' * dom:post-XXX : dumps DOM after pass XXX runs',
                ' * dom:post-* : dumps DOM after every pass',
                '',
                ' Available passes (in the order they run):',
                '',
                ' fostered, process-fixups, Normalize, pwrap, ',
                ' media, migrate-metas, migrate-nls, dsr, tplwrap, ',
                ' dom-unpack, pp:EXT (replace EXT with extension: Pre, Gallery, etc)',
                ' fixups, strip-metas, lang-converter, redlinks, ',
                ' displayspace, linkclasses, sections, convertoffsets',
                ' i18n, cleanup',
                '',
                ' --- Dump flags for html2wt ---',
                ' * dom:post-dom-diff : in selective serialization, dumps DOM after running dom diff',
                ' * dom:post-normal : in serialization, dumps DOM after normalization',
                // These entries carry their own "\n" so that a blank line follows them.
                " * wt2html:limits : dumps used resources (along with configured limits)\n",
                "--debug dumps state at these different stages\n",
                'Examples:',
                "$ php $script --dump dom:pre-dsr,dom:pre-tplwrap < foo",
                "$ php $script --trace html --dump dom:pre-tplwrap < foo",
                "\n"
            ]
        );
    }

    /**
     * Returns a help message for the debug flags.
     *
     * @return string Newline-joined help text.
     */
    public static function debugUsageHelp(): string {
        return implode(
            "\n", [
                'Debugging',
                '---------',
                '- With one or more comma-separated flags, ' .
                    'provides more verbose tracing than the equivalent trace flag',
                '- Supported flags:',
                ' * pre : shows actions of the pre handler',
                ' * wts : trace actions of the regular wikitext serializer',
                ' * selser : trace actions of the selective serializer'
            ]
        );
    }

    /**
     * Set debugging flags on an object, based on an options object.
     *
     * @param array &$envOptions Options to be passed to the Env constructor.
     * @param array $cliOpts The options object to use for setting the debug flags.
     * @return array The modified object.
*/
    public static function setDebuggingFlags( array &$envOptions, array $cliOpts ): array {
        $traceOpt = $cliOpts['trace'] ?? null;
        $dumpOpt = $cliOpts['dump'] ?? null;
        $debugOpt = $cliOpts['debug'] ?? null;

        // Handle the --help options: print every help text that was asked
        // for, then terminate the script.
        $helpShown = false;
        if ( $traceOpt === 'help' ) {
            echo self::traceUsageHelp();
            $helpShown = true;
        }
        if ( $dumpOpt === 'help' ) {
            echo self::dumpUsageHelp();
            $helpShown = true;
        }
        if ( $debugOpt === 'help' ) {
            echo self::debugUsageHelp();
            $helpShown = true;
        }
        if ( $helpShown ) {
            exit( 1 );
        }

        // Ok, no help requested: process the options.
        if ( $debugOpt === true ) {
            // Continue to support generic debugging.
            error_log( 'Warning: Generic debugging, not handler-specific.' );
            $envOptions['debug'] = self::booleanOption( $debugOpt );
        } elseif ( $debugOpt !== null ) {
            // Setting --debug automatically enables --trace
            $debugFlags = self::fetchFlagsMap( $debugOpt );
            $envOptions['debugFlags'] = $debugFlags;
            $envOptions['traceFlags'] = $debugFlags;
        }

        if ( $traceOpt === true ) {
            error_log(
                "Warning: Generic tracing is no longer supported. " .
                "Ignoring --trace flag. " .
                "Please provide handler-specific tracing flags, " .
                "e.g. '--trace pre,html5', to turn it on."
            );
        } elseif ( $traceOpt !== null ) {
            // Add any new trace flags to the list of existing trace flags (if
            // any were inherited from debug); otherwise, create a new list.
            $inherited = $envOptions['traceFlags'] ?? [];
            $envOptions['traceFlags'] = array_merge( $inherited, self::fetchFlagsMap( $traceOpt ) );
        }

        if ( $dumpOpt === true ) {
            error_log( 'Warning: Generic dumping not enabled. Please set a flag.' );
        } elseif ( $dumpOpt !== null ) {
            $envOptions['dumpFlags'] = self::fetchFlagsMap( $dumpOpt );
        }

        return $envOptions;
    }

    /**
     * Sets templating and processing flags on an object,
     * based on an options object.
     *
     * @param array &$envOptions Options to be passed to the Env constructor.
     * @param array $cliOpts The options object to use for setting the debug flags.
     * @return array The modified object.
*/
    public static function setTemplatingAndProcessingFlags(
        array &$envOptions, array $cliOpts
    ): array {
        // Options that are copied over one-to-one after boolean normalisation.
        $templateFlags = [
            'fetchConfig',
            'fetchTemplates',
            'fetchImageInfo',
            'expandExtensions',
            'addHTMLTemplateParameters',
        ];
        foreach ( $templateFlags as $flag ) {
            if ( isset( $cliOpts[$flag] ) ) {
                $envOptions[$flag] = self::booleanOption( $cliOpts[$flag] );
            }
        }

        if ( isset( $cliOpts['usePHPPreProcessor'] ) ) {
            // The preprocessor is only usable when templates are fetched;
            // a missing 'fetchTemplates' entry counts as "off".
            $envOptions['usePHPPreProcessor'] = ( $envOptions['fetchTemplates'] ?? false ) &&
                self::booleanOption( $cliOpts['usePHPPreProcessor'] );
        }

        // The option is registered as 'maxdepth' (all lower case) in
        // addStandardOptions(). The previous guard tested 'maxDepth', so the
        // value given on the command line was never applied. The camel-cased
        // key is still honoured for callers that build $cliOpts by hand.
        $maxDepth = $cliOpts['maxdepth'] ?? $cliOpts['maxDepth'] ?? null;
        if ( is_numeric( $maxDepth ) ) {
            $envOptions['maxDepth'] = $maxDepth;
        }

        if ( isset( $cliOpts['apiURL'] ) ) {
            $envOptions['mwApis'] ??= [];
            $envOptions['mwApis'][] = [ 'prefix' => 'customwiki', 'uri' => $cliOpts['apiURL'] ];
        }

        // Presence of the option is enough to enable linting.
        if ( isset( $cliOpts['lint'] ) ) {
            $envOptions['linting'] = true;
        }

        return $envOptions;
    }

    /**
     * Parse a boolean option returned by our opts processor.
     * The strings 'false' and 'no' are also treated as false values.
     * This allows `--debug=no` and `--debug=false` to mean the same as
     * `--no-debug`.
     *
     * @param bool|string $val
     *   a boolean, or a string naming a boolean value.
     * @return bool
     */
    public static function booleanOption( $val ): bool {
        if ( !$val ) {
            return false;
        }
        return !in_array( $val, [ 'no', 'false' ], true );
    }

    /**
     * Set the color flags, based on an options object.
     *
     * @param array $options options object to use for setting the mode of the 'color' package.
     *  - string|boolean options.color
     *    Whether to use color.
     *    Passing 'auto' will enable color only if stdout is a TTY device.
* @suppress PhanEmptyPublicMethod
     */
    public static function setColorFlags( array $options ): void {
        /**
         * PORT-FIXME:
         * if ( $options->color === 'auto' ) {
         *   if ( !$process->stdout->isTTY ) {
         *     $colors->mode = 'none';
         *   }
         * } elseif ( !self::booleanOption( $options->color ) ) {
         *   $colors->mode = 'none';
         * }
         */
    }

    /**
     * PORT-FIXME: Should some of this functionality be moved to OptsProcessor directly?
     *
     * Merge the standard option definitions into a script's own definitions.
     * The standard set covers the options consumed by `setDebuggingFlags`,
     * `setTemplatingAndProcessingFlags`, `setColorFlags`, plus `--help`.
     *
     * @param array $opts Script-specific option definitions; an entry here wins
     *   over a standard option of the same name.
     * @param array $defaults Optional map of option name => replacement default
     *   for standard options. Names that are not standard options are ignored.
     * @return array The combined option definitions.
     */
    public static function addStandardOptions( array $opts, array $defaults = [] ): array {
        $builtin = [
            // standard CLI options
            'help' => [
                'description' => 'Show this help message',
                'boolean' => true,
                'default' => false,
                'alias' => 'h'
            ],
            // handled by `setDebuggingFlags`
            'debug' => [
                'description' => 'Provide optional flags. Use --debug=help for supported options'
            ],
            'trace' => [
                'description' => 'Use --trace=help for supported options'
            ],
            'dump' => [
                'description' => 'Dump state. Use --dump=help for supported options'
            ],
            // handled by `setTemplatingAndProcessingFlags`
            'fetchConfig' => [
                'description' => 'Whether to fetch the wiki config from the server or use our local copy',
                'boolean' => true,
                'default' => true
            ],
            'fetchTemplates' => [
                'description' => 'Whether to fetch included templates recursively',
                'boolean' => true,
                'default' => true
            ],
            'fetchImageInfo' => [
                'description' => 'Whether to fetch image info via the API',
                'boolean' => true,
                'default' => true
            ],
            'expandExtensions' => [
                'description' => 'Whether we should request extension tag expansions from a wiki',
                'boolean' => true,
                'default' => true
            ],
            'usePHPPreProcessor' => [
                'description' => 'Whether to use the PHP preprocessor to expand templates',
                'boolean' => true,
                'default' => true
            ],
            'addHTMLTemplateParameters' => [
                'description' => 'Parse template parameters to HTML and add them to template data',
                'boolean' => true,
                'default' => false
            ],
            'maxdepth' => [
                'description' => 'Maximum expansion depth',
                'default' => 40
            ],
            'apiURL' => [
                'description' => 'http path to remote API, e.g. http://en.wikipedia.org/w/api.php',
                'default' => null
            ],
            // handled by `setColorFlags`
            'color' => [
                'description' => 'Enable color output Ex: --no-color',
                'default' => 'auto'
            ]
        ];

        // Apply caller-supplied defaults, but only to options that exist above.
        foreach ( array_intersect_key( $defaults, $builtin ) as $optName => $newDefault ) {
            $builtin[$optName]['default'] = $newDefault;
        }

        // Array union keeps the left-hand entry on key collisions, so the
        // script's own definitions take precedence.
        return $opts + $builtin;
    }
}
PK !
�xQ.a .a PipelineUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Assert\Assert; use Wikimedia\Assert\UnreachableException; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\NodeList; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\NodeData\DataMw; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\NodeData\TempData; use Wikimedia\Parsoid\Tokens\CommentTk; use Wikimedia\Parsoid\Tokens\EndTagTk; use Wikimedia\Parsoid\Tokens\EOFTk; use Wikimedia\Parsoid\Tokens\KV; use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; use Wikimedia\Parsoid\Tokens\SourceRange; use Wikimedia\Parsoid\Tokens\TagTk; use Wikimedia\Parsoid\Tokens\Token; use Wikimedia\Parsoid\Wt2Html\Frame; /** * This file contains parsing pipeline related utilities. */ class PipelineUtils { /** * Creates a dom-fragment-token for processing 'content' (an array of tokens) * in its own subpipeline all the way to DOM. These tokens will be processed * by their own handler (DOMFragmentBuilder) in the last stage of the async * pipeline. * * srcOffsets should always be provided to process top-level page content in a * subpipeline. Without it, DSR computation and template wrapping cannot be done * in the subpipeline. While unpackDOMFragment can do this on unwrapping, that can * be a bit fragile and makes dom-fragments a leaky abstraction by leaking subpipeline * processing into the top-level pipeline. * * @param string|Token|array<Token|string> $content The array of tokens to process. * @param SourceRange $srcOffsets Wikitext source offsets (start/end) of these tokens. * @param array $opts Parsing options. * - Token token The token that generated the content. * - bool inlineContext Is this DOM fragment used in an inline context? 
* @return SelfclosingTagTk
     */
    public static function getDOMFragmentToken(
        $content, SourceRange $srcOffsets, array $opts = []
    ): SelfclosingTagTk {
        // NOTE: 'token' is read unconditionally even though $opts has a default.
        $token = $opts['token'];
        return new SelfclosingTagTk( 'mw:dom-fragment-token', [
            new KV( 'contextTok', $token, $token->dataParsoid->tsr->expandTsrV() ),
            new KV( 'content', $content, $srcOffsets->expandTsrV() ),
            new KV( 'inlineContext', ( $opts['inlineContext'] ?? false ) ? "1" : "0" ),
            new KV( 'inPHPBlock', ( $opts['inPHPBlock'] ?? false ) ? "1" : "0" ),
        ] );
    }

    /**
     * Processes content (wikitext, array of tokens, whatever) in its own
     * pipeline based on options.
     *
     * @param Env $env The environment/context for the expansion.
     * @param Frame $frame
     *    The parent frame within which the expansion is taking place.
     *    Used for template expansion and source text tracking.
     * @param string|Token|array<Token|string>|Element $content
     *    How this content is processed depends on what kind of pipeline
     *    is constructed specified by opts.
     * @param array $opts
     *    Processing options that specify pipeline-type, opts, and callbacks.
     *    - string pipelineType
     *    - array pipelineOpts
     *    - array tplArgs - if set, defines parameters for the child frame
     *      - string tplArgs['name']
     *      - KV[] tplArgs['attribs']
     *    - string srcText - if set, defines the source text for the expansion
     *    - SourceRange srcOffsets - if set, defines the range within the
     *          source text that $content corresponds to
     *    - bool sol Whether tokens should be processed in start-of-line context.
     *    - bool toplevel Whether the pipeline is considered atTopLevel
     * @return array<Token|string>|DocumentFragment (depending on pipeline type)
     */
    public static function processContentInPipeline(
        Env $env, Frame $frame, $content, array $opts
    ) {
        // Build a pipeline
        $pipeline = $env->getPipelineFactory()->getPipeline(
            $opts['pipelineType'],
            $opts['pipelineOpts']
        );

        $pipeline->init( [
            // NOTE: some pipelines force toplevel to true
            'toplevel' => $opts['toplevel'] ?? false,
            'frame' => $frame,
            'tplArgs' => $opts['tplArgs'] ?? null,
            // Fall back to the parent frame's source text.
            'srcText' => $opts['srcText'] ?? $frame->getSrcText(),
            'srcOffsets' => $opts['srcOffsets'] ?? null,
        ] );

        // Off the starting block ... ready, set, go!
        return $pipeline->parse( $content, [ 'sol' => $opts['sol'] ] );
    }

    /**
     * Expands value all the way to DOM.
     *
     * @param Env $env
     *    The environment/context for the expansion.
     * @param Frame $frame
     *    The parent frame within which the expansion is taking place.
     *    Used for template expansion and source text tracking.
     * @param array $v
     *    The value to process.
     *    The value is expected to be an associative array with a "html" property.
     *    The html property is expanded to DOM only if it is an array (of tokens).
     *    Non-arrays are passed back unexpanded.
     * @param bool $expandTemplates
     *    Should any templates encountered here be expanded
     *    (usually false for nested templates since they are never directly editable).
     * @param bool $inTemplate
     *    Unexpanded templates can occur in the content of extension tags.
     * @return array The value with "html" serialized and "srcOffsets" removed.
     */
    public static function expandAttrValueToDOM(
        Env $env, Frame $frame, array $v, bool $expandTemplates, bool $inTemplate
    ): array {
        if ( is_array( $v['html'] ?? null ) ) {
            // Set up pipeline options
            $opts = [
                'pipelineType' => 'expanded-tokens-to-fragment',
                'pipelineOpts' => [
                    'attrExpansion' => true,
                    'inlineContext' => true,
                    'expandTemplates' => $expandTemplates,
                    'inTemplate' => $inTemplate
                ],
                'srcOffsets' => $v['srcOffsets'],
                'sol' => true
            ];
            // The token list is terminated with an EOF token before parsing.
            $content = array_merge( $v['html'], [ new EOFTk() ] );
            $domFragment = self::processContentInPipeline(
                $env, $frame, $content, $opts
            );
            // Since we aren't at the top level, data attrs
            // were not applied in cleanup. However, tmp
            // was stripped.
            $v['html'] = ContentUtils::ppToXML(
                $domFragment, [ 'innerXML' => true ]
            );
        }
        // Remove srcOffsets after value is expanded, so they don't show
        // up in the output data-mw attribute
        unset( $v['srcOffsets'] );
        return $v;
    }

    /**
     * Expand every array entry of $vals via expandAttrValueToDOM().
     *
     * @param Env $env
     *    The environment/context for the expansion.
     * @param Frame $frame
     *    The parent frame within which the expansion is taking place.
     *    Used for template expansion and source text tracking.
     * @param array $vals
     *    Array of values to expand.
     *    Non-array elements of $vals are passed back unmodified.
     *    If an array element, it is expected to be an associative array with a "html" property.
     *    The html property is expanded to DOM only if it is an array (of tokens).
     * @param bool $expandTemplates
     *    Should any templates encountered here be expanded
     *    (usually false for nested templates since they are never directly editable).
     * @param bool $inTemplate
     *    Unexpanded templates can occur in the content of extension tags.
     * @return array List of results, in the order of $vals (keys are not preserved).
     */
    public static function expandAttrValuesToDOM(
        Env $env, $frame, array $vals, bool $expandTemplates, bool $inTemplate
    ): array {
        $ret = [];
        foreach ( $vals as $v ) {
            // expandAttrValueToDOM() only accepts arrays; forwarding a scalar
            // would raise a TypeError under strict_types, contradicting the
            // documented "passed back unmodified" contract.
            $ret[] = is_array( $v )
                ? self::expandAttrValueToDOM( $env, $frame, $v, $expandTemplates, $inTemplate )
                : $v;
        }
        return $ret;
    }

    /**
     * Collect the pieces needed to build a tag token from a DOM element.
     * The node comes from a DOM whose data attributes are stored outside the DOM.
     *
     * @param Element $node
     * @param array<string,string> $attrs
     * @return array{attrs:KV[],dataParsoid:?DataParsoid,dataMw:?DataMw}
     */
    private static function domAttrsToTagAttrs( Element $node, array $attrs ): array {
        $out = [];
        foreach ( $attrs as $name => $value ) {
            // The data-object bookkeeping attribute is not a real attribute.
            if ( $name !== DOMDataUtils::DATA_OBJECT_ATTR_NAME ) {
                $out[] = new KV( $name, $value );
            }
        }
        return [
            'attrs' => $out,
            'dataParsoid' => DOMDataUtils::getDataParsoid( $node ),
            'dataMw' => DOMDataUtils::validDataMw( $node ) ? DOMDataUtils::getDataMw( $node ) : null,
        ];
    }

    /**
     * Convert a DOM to tokens. Data attributes for nodes are stored outside the DOM.
*
     * @param Node $node The root of the DOM tree to convert to tokens
     * @param array<Token|string> $tokBuf This is where the tokens get stored
     * @return array The buffer with the tokens for $node appended
     */
    private static function convertDOMtoTokens( Node $node, array $tokBuf ): array {
        if ( $node instanceof Text ) {
            // Text is appended after being run through newlinesToNlTks().
            PHPUtils::pushArray( $tokBuf, TokenUtils::newlinesToNlTks( $node->nodeValue ) );
            return $tokBuf;
        }

        if ( $node instanceof Comment ) {
            $tokBuf[] = new CommentTk( $node->nodeValue );
            return $tokBuf;
        }

        if ( !( $node instanceof Element ) ) {
            // getWrapperTokens calls convertDOMToTokens with a Element
            // and children of dom elements are always text/comment/elements
            // which are all covered above.
            throw new UnreachableException( "Should never get here!" );
        }

        $tagName = DOMCompat::nodeName( $node );
        $info = self::domAttrsToTagAttrs( $node, DOMUtils::attributes( $node ) );

        if ( Utils::isVoidElement( $tagName ) ) {
            // Void elements have neither children nor an end tag.
            $tokBuf[] = new SelfclosingTagTk(
                $tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
            );
            return $tokBuf;
        }

        $tokBuf[] = new TagTk(
            $tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
        );

        // Recurse over the children in document order.
        $kid = $node->firstChild;
        while ( $kid ) {
            $tokBuf = self::convertDOMtoTokens( $kid, $tokBuf );
            $kid = $kid->nextSibling;
        }

        $closeTag = new EndTagTk( $tagName );
        // Keep stx parity
        if ( WTUtils::isLiteralHTMLNode( $node ) ) {
            $closeTag->dataParsoid->stx = 'html';
        }
        $tokBuf[] = $closeTag;

        return $tokBuf;
    }

    /**
     * Get tokens representing a DOM forest (from transclusions, extensions,
     * whatever that were generated as part of a separate processing pipeline)
     * in the token stream. These tokens will tunnel the subtree through the
     * token processing while preserving token stream semantics as if
     * the DOM had been converted to tokens.
     *
     * @param DocumentFragment $domFragment List of DOM nodes that need to be tunneled through.
     * @param array $opts
     * @see encapsulateExpansionHTML's doc. for more info about these options.
     * @return array<Token|string> List of token representatives.
*/
    private static function getWrapperTokens(
        DocumentFragment $domFragment, array $opts
    ): array {
        // An empty fragment is represented by a bare span pair.
        if ( !$domFragment->hasChildNodes() ) {
            return [ new TagTk( 'span' ), new EndTagTk( 'span' ) ];
        }

        $node = $domFragment->firstChild;

        // Do we represent this with inline or block elements?
        // This is to ensure that we get p-wrapping correct.
        //
        // * If all content is inline, we use inline-elements to represent this
        //   so that this content gets swallowed into the P tag that wraps
        //   adjacent inline content.
        //
        // * If any part of this is a block content, we treat extension content
        //   independent of surrounding content and don't want inline content
        //   here to be swallowed into a P tag that wraps adjacent inline content.
        //
        // This behavior ensures that we and clients can "drop-in" extension content
        // into the DOM without messing with fixing up paragraph tags of surrounding
        // content. It could potentially introduce minor rendering differences when
        // compared to PHP parser output, but we'll swallow it for now.
        //
        // The fragment is NOT inspected (and stays INLINE) in two situations:
        //
        // 1. inlineContext is set: the DOM fragment is being processed in a
        //    context where P wrapping has been suppressed, so we represent it
        //    with inline-tokens.
        //    FIXME(SSS): Looks like we have some "impedance mismatch" here. But, this
        //    is correct in scenarios where link-content or image-captions are being
        //    processed in a sub-pipeline and we don't want a <div> in the link-caption
        //    to cause the <a>..</a> to get split apart.
        //    Filed as T49963
        //
        // 2. unpackOutput is off: fragments that won't be unpacked aren't
        //    amenable to inspection, since the ultimate content is unknown.
        //    For example, refs shuttle content through treebuilding that ends
        //    up in the references list.
        //    FIXME(arlolra): Do we need a mechanism to specify content
        //    categories?
        $wrapperType = 'INLINE';
        if ( empty( $opts['pipelineOpts']['inlineContext'] ) && $opts['unpackOutput'] ) {
            foreach ( $domFragment->childNodes as $n ) {
                if (
                    DOMUtils::isWikitextBlockNode( $n ) ||
                    DOMUtils::hasBlockElementDescendant( $n )
                ) {
                    $wrapperType = 'BLOCK';
                    break;
                }
            }
        }

        if ( $wrapperType === 'BLOCK' && !DOMUtils::isWikitextBlockNode( $node ) ) {
            $wrapperName = 'div';
        } elseif (
            // Do not use 'A' as a wrapper node because it could
            // end up getting nested inside another 'A' and the DOM
            // structure can change where the wrapper tokens are no
            // longer siblings.
            // Ex: "[http://foo.com Bad nesting [[Here]]].
            DOMCompat::nodeName( $node ) === 'a' ||
            // <style>/<script> tags are not fostered, so if we're wrapping
            // more than a single node, they aren't a good representation for
            // the content. It can lead to fosterable content being inserted
            // in a fosterable position after treebuilding is done, which isn't
            // roundtrippable.
            (
                in_array( DOMCompat::nodeName( $node ), [ 'style', 'script' ], true ) &&
                $node->nextSibling !== null
            ) ||
            // Text and comment nodes cannot serve as a wrapper themselves.
            !( $node instanceof Element )
        ) {
            $wrapperName = 'span';
        } else {
            $wrapperName = DOMCompat::nodeName( $node );
        }

        if ( !( $node instanceof Element ) ) {
            $workNode = $node->ownerDocument->createElement( $wrapperName );
        } else {
            Assert::invariant(
                // No need to look for data-mw as well.
                // Nodes that have data-mw also have data-parsoid.
                !$node->hasAttribute( 'data-parsoid' ),
                "Expected node to have its data attributes loaded"
            );

            $nodeData = DOMDataUtils::getNodeData( $node )->cloneNodeData();

            if ( $wrapperName === DOMCompat::nodeName( $node ) ) {
                // Shallow clone since we don't want to convert the whole tree to tokens.
                $workNode = $node->cloneNode( false );

                // Reset 'tsr' since it isn't applicable. Neither is
                // any auxiliary info like 'endTSR'.
                // FIXME: The above comment is only true if we are reusing
                // DOM fragments from cache from previous revisions in
                // incremental parsing scenarios. See T98992
                if ( isset( $nodeData->parsoid->tsr ) ) {
                    $nodeData->parsoid->tsr = null;
                }
                if ( isset( $nodeData->parsoid->tmp->endTSR ) ) {
                    unset( $nodeData->parsoid->tmp->endTSR );
                }

                // The "in transclusion" flag was set on the first child for template
                // wrapping in the nested pipeline, and doesn't apply to the dom
                // fragment wrapper in this pipeline. Keeping it around can induce
                // template wrapping of a foster box if the dom fragment is found in
                // a fosterable position.
                if (
                    isset( $nodeData->parsoid ) &&
                    $nodeData->parsoid->getTempFlag( TempData::IN_TRANSCLUSION )
                ) {
                    $nodeData->parsoid->tmp->setFlag( TempData::IN_TRANSCLUSION, false );
                }
            } else {
                // Create a copy of the node without children
                $workNode = $node->ownerDocument->createElement( $wrapperName );

                // Copy over attributes
                foreach ( DOMUtils::attributes( $node ) as $attrName => $attrValue ) {
                    // "typeof" is ignored since it'll be removed below.
                    if ( $attrName !== 'typeof' ) {
                        $workNode->setAttribute( $attrName, $attrValue );
                    }
                }

                // We are applying a different wrapper.
                // So, node's data-parsoid isn't applicable.
                $nodeData->parsoid = new DataParsoid();
            }

            DOMDataUtils::setNodeData( $workNode, $nodeData );
        }

        $tokens = self::convertDOMtoTokens( $workNode, [] );
        $openTag = $tokens[0];

        // Remove the typeof attribute from the first token.
        // It will be replaced with mw:DOMFragment.
        $openTag->removeAttribute( 'typeof' );

        // Remove the about attribute from the first token.
        // We want to be able to distinguish when this wrapper was template
        // annotated.
        $openTag->removeAttribute( 'about' );

        return $tokens;
    }

    /**
     * Generates wrapper tokens for a HTML expansion -- the wrapper
     * tokens are placeholders that adequately represent semantics
     * of the HTML DOM for the purposes of additional token transformations
     * that will be applied to them.
     *
     * @param Env $env
     *    The active environment/context.
     * @param Token $token
     *    The token that generated the DOM.
     * @param array $expansion
     *  - string html HTML of the expansion.
*  - DocumentFragment domFragment Outermost nodes of the HTML.
     * @param array $opts
     *  - SourceRange tsr
     *    The TSR to set on the generated tokens. This TSR is
     *    used to compute DSR on the placeholder tokens.
     *    The computed DSR is transferred over to the unpacked DOM
     *    if setDSR is true (see below).
     *  - bool setDSR
     *    When the DOM fragment is unpacked, this option governs
     *    whether the DSR from the placeholder node is transferred
     *    over to the unpacked DOM or not.
     *    For example: Cite, reused transclusions.
     *  - bool fromCache
     *  - array pipelineOpts
     *  - bool unpackOutput
     *  - string wrapperName
     * @return array<Token|string>
     */
    public static function encapsulateExpansionHTML(
        Env $env, Token $token, array $expansion, array $opts
    ): array {
        $opts['unpackOutput'] ??= true; // Default

        // Get placeholder tokens to get our subdom through the token processing
        // stages. These will be finally unwrapped on the DOM.
        $toks = self::getWrapperTokens( $expansion['domFragment'], $opts );
        $firstWrapperToken = $toks[0];

        // Add the DOMFragment type so that we get unwrapped later.
        // Sealed (not-to-be-unpacked) fragments also carry the wrapper name.
        $fragmentType = 'mw:DOMFragment' .
            ( !$opts['unpackOutput'] ? '/sealed/' . $opts['wrapperName'] : '' );
        $firstWrapperToken->setAttribute( 'typeof', $fragmentType );

        // Assign the HTML fragment to the data-parsoid.html on the first wrapper token.
        $firstWrapperToken->dataParsoid->html = $expansion['html'];

        // Pass through setDSR flag
        if ( !empty( $opts['setDSR'] ) ) {
            $firstWrapperToken->dataParsoid->setTempFlag(
                TempData::SET_DSR, $opts['setDSR']
            );
        }

        // Pass through fromCache flag
        if ( !empty( $opts['fromCache'] ) ) {
            $firstWrapperToken->dataParsoid->setTempFlag(
                TempData::FROM_CACHE, $opts['fromCache']
            );
        }

        // Transfer the tsr.
        // The first token gets the full width, the following tokens zero width.
        $tokenTsr = $opts['tsr'] ?? $token->dataParsoid->tsr ?? null;
        if ( $tokenTsr ) {
            $firstWrapperToken->dataParsoid->tsr = $tokenTsr;
            $firstWrapperToken->dataParsoid->extTagOffsets = $token->dataParsoid->extTagOffsets ?? null;
            // XXX to investigate: if $tokenTsr->end is null, then we're losing
            // the 'hint' we'd like to provide here that this is a zero-width
            // source range.
            // ->end can be set to null by WikiLinkHandler::bailTokens()
            $endTsr = new SourceRange( $tokenTsr->end, $tokenTsr->end );
            for ( $i = 1; $i < count( $toks ); $i++ ) {
                $toks[$i]->dataParsoid->tsr = clone $endTsr;
            }
        }

        return $toks;
    }

    /**
     * Move the accumulated nodes into a new span inserted where the first
     * of them used to be, flag the span as a wrapper, and empty the accumulator.
     *
     * @param Document $doc Document used to create the span
     * @param array &$textCommentAccum Non-empty list of sibling nodes; reset to [] on return
     */
    private static function wrapAccum(
        Document $doc, array &$textCommentAccum
    ): void {
        // Wrap accumulated nodes in a span
        $span = $doc->createElement( 'span' );
        $parentNode = $textCommentAccum[0]->parentNode;
        $parentNode->insertBefore( $span, $textCommentAccum[0] );
        foreach ( $textCommentAccum as $n ) {
            $span->appendChild( $n );
        }
        $dp = new DataParsoid;
        $dp->setTempFlag( TempData::WRAPPER );
        DOMDataUtils::setDataParsoid( $span, $dp );
        $textCommentAccum = [];
    }

    /**
     * Wrap text and comment nodes in a node list into spans, so that all
     * top-level nodes are elements.
     *
     * @param NodeList $nodes List of DOM nodes to wrap, mix of node types.
     *   An empty list is a no-op.
     * @param ?Node $startAt If given, nodes before this one are left alone
     * @param ?Node $stopAt If given, processing stops after this node
     */
    public static function addSpanWrappers(
        $nodes,
        ?Node $startAt = null,
        ?Node $stopAt = null
    ): void {
        // Nothing to wrap, and no first node to take the document from.
        $firstNode = $nodes->item( 0 );
        if ( $firstNode === null ) {
            return;
        }

        $textCommentAccum = [];
        $doc = $firstNode->ownerDocument;

        // Build a real array out of nodes.
        //
        // Operating directly on DOM child-nodes array
        // and manipulating them by adding span wrappers
        // changes the traversal itself
        $nodeBuf = [];
        foreach ( $nodes as $node ) {
            $nodeBuf[] = $node;
        }

        $start = ( $startAt === null );
        foreach ( $nodeBuf as $node ) {
            if ( !$start ) {
                if ( $startAt !== $node ) {
                    continue;
                }
                $start = true;
            }
            if ( $node instanceof Text || $node instanceof Comment ) {
                $textCommentAccum[] = $node;
            } elseif ( count( $textCommentAccum ) ) {
                // A non-text/comment node ends the current run.
                self::wrapAccum( $doc, $textCommentAccum );
            }
            if ( $node === $stopAt ) {
                break;
            }
        }

        // Flush a trailing run.
        if ( count( $textCommentAccum ) ) {
            self::wrapAccum( $doc, $textCommentAccum );
        }
    }

    /**
     * Convert a HTML5 DOM into a mw:DOMFragment and generate appropriate
     * tokens to insert into the token stream for further processing.
     *
     * The DOMPostProcessor will unpack the fragment and insert the HTML
     * back into the DOM.
     *
     * @param Env $env
     *    The active environment/context.
     * @param Token $token
     *    The token that generated the DOM.
     * @param DocumentFragment $domFragment
     *    The DOM that the token expanded to.
     * @param array $opts
     *    Options to be passed onto the encapsulation code
     *    See encapsulateExpansionHTML's doc. for more info about these options.
     * @return array<Token|string>
     */
    public static function tunnelDOMThroughTokens(
        Env $env, Token $token, DocumentFragment $domFragment, array $opts
    ): array {
        // Get placeholder tokens to get our subdom through the token processing
        // stages. These will be finally unwrapped on the DOM.
        $expansion = self::makeExpansion( $env, $domFragment );
        return self::encapsulateExpansionHTML( $env, $token, $expansion, $opts );
    }

    /**
     * Register a fragment with the environment under a fresh fragment id.
     *
     * @param Env $env
     * @param DocumentFragment $domFragment
     * @return array 'domFragment' => the fragment, 'html' => the fragment id
     */
    public static function makeExpansion(
        Env $env, DocumentFragment $domFragment
    ): array {
        $fragmentId = $env->newFragmentId();
        $env->setDOMFragment( $fragmentId, $domFragment );
        return [ 'domFragment' => $domFragment, 'html' => $fragmentId ];
    }

    /**
     * Walk $node and its following siblings (recursing into children of
     * elements that are not encapsulation wrappers) looking for
     * transclusion/extension output.
     *
     * @param Env $env
     * @param array &$expansions
     * @param ?Node $node First node to visit. Null is accepted because callers
     *   pass `->firstChild`, which is null for childless nodes; with the former
     *   non-nullable type that raised a TypeError.
     */
    private static function doExtractExpansions( Env $env, array &$expansions, ?Node $node ): void {
        $nodes = null;
        $expAccum = null;
        while ( $node ) {
            if ( $node instanceof Element ) {
                if (
                    DOMUtils::matchTypeOf( $node, '#^mw:(Transclusion$|Extension/)#' ) &&
                    $node->hasAttribute( 'about' )
                ) {
                    $dp = DOMDataUtils::getDataParsoid( $node );
                    $about = DOMCompat::getAttribute( $node, 'about' );
                    $nodes = WTUtils::getAboutSiblings( $node, $about );
                    $key = null;
                    if ( DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) ) {
                        $expAccum = $expansions['transclusions'];
                        $key = $dp->src;
                    } elseif ( DOMUtils::matchTypeOf( $node, '#^mw:Extension/#' ) ) {
                        $expAccum = $expansions['extensions'];
                        $key = $dp->src;
                    } else {
                        $expAccum = $expansions['media'];
                        // XXX gwicke: use proper key that is not
                        // source-based? This also needs to work for
                        // transclusion output.
                        $key = null;
                    }

                    if ( $key ) {
                        throw new UnreachableException( 'Callsite was not ported!' );
                        // FIXME: makeExpansion return type changed
                        // $expAccum[$key] = self::makeExpansion( $env, $nodes );
                    }

                    // Skip past the whole about-sibling group.
                    $node = end( $nodes );
                } else {
                    self::doExtractExpansions( $env, $expansions, $node->firstChild );
                }
            }
            $node = $node->nextSibling;
        }
    }

    /**
     * Extract transclusion and extension expansions from a DOM, and return
     * them in a structure like this:
     *     {
     *         transclusions: {
     *             'key1': {
     *                  html: 'html1',
     *                  nodes: [<node1>, <node2>]
     *             }
     *         },
     *         extensions: {
     *             'key2': {
     *                  html: 'html2',
     *                  nodes: [<node1>, <node2>]
     *             }
     *         },
     *         media: {
     *             'key3': {
     *                  html: 'html3',
     *                  nodes: [<node1>, <node2>]
     *             }
     *         }
     *     }
     *
     * NOTE: the population step is not ported yet. The three buckets are
     * returned empty, and an encapsulation wrapper with a non-empty source
     * key makes the walk throw UnreachableException.
     *
     * @param Env $env
     * @param Element $body
     * @return array
     */
    public static function extractExpansions( Env $env, Element $body ): array {
        $expansions = [
            'transclusions' => [],
            'extensions' => [],
            'media' => []
        ];
        // Kick off the extraction
        self::doExtractExpansions( $env, $expansions, $body->firstChild );
        return $expansions;
    }

    /**
     * Fetches output of encapsulations that return HTML from the legacy parser
     *
     * @param Env $env
     * @param string $source Wikitext handed to the data access layer
     * @return ?DocumentFragment Null when the parser returned an empty string
     */
    public static function fetchHTML( Env $env, string $source ): ?DocumentFragment {
        $ret = $env->getDataAccess()->parseWikitext(
            $env->getPageConfig(), $env->getMetadata(), $source
        );
        return $ret === '' ? null : DOMUtils::parseHTMLToFragment(
            $env->topLevelDoc, DOMUtils::stripPWrapper( $ret )
        );
    }
}
PK ! �&�o! o! DOMTraverser.phpnu �Iw��
<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Utils;

use Wikimedia\Parsoid\DOM\DocumentFragment;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;

/**
 * Class for helping us traverse the DOM.
 *
 * This class currently does a pre-order depth-first traversal.
 * See {@link DOMPostOrder} for post-order traversal.
 */
class DOMTraverser {
    /**
     * List of handlers to call on each node.
Each handler is an array with the following fields:
     * - action: a callable to call
     * - nodeName: if set, only call it on nodes with this name
     * @var array<array{action:callable,nodeName:?string}>
     * @see addHandler()
     */
    private $handlers = [];

    /**
     * Should the handlers be called on attribute-embedded-HTML strings?
     */
    private bool $applyToAttributeEmbeddedHTML;

    /**
     * Whether traversal tracks template/extension info in $state->tplInfo.
     * @var bool
     */
    private $traverseWithTplInfo;

    /**
     * @param bool $traverseWithTplInfo
     * @param bool $applyToAttributeEmbeddedHTML
     */
    public function __construct(
        bool $traverseWithTplInfo = false,
        bool $applyToAttributeEmbeddedHTML = false
    ) {
        $this->applyToAttributeEmbeddedHTML = $applyToAttributeEmbeddedHTML;
        $this->traverseWithTplInfo = $traverseWithTplInfo;
    }

    /**
     * Add a handler to the DOM traverser.
     *
     * @param ?string $nodeName An optional node name filter
     * @param callable $action A callback, called on each node we traverse that matches nodeName.
     *   It is invoked with two arguments:
     *   - Node $node: the node being processed
     *   - ?DTState $state: the traversal state (whatever was passed to traverse()).
     *   Return value: Node|null|true.
     *   - true: proceed normally
     *   - Node: traversal will continue on the new node (further handlers will not be called
     *     on the current node); after processing it and its siblings, it will continue with the
     *     next sibling of the closest ancestor which has one.
     *   - null: like the Node case, except there is no new node to process before continuing.
     */
    public function addHandler( ?string $nodeName, callable $action ): void {
        $entry = [
            'action' => $action,
            'nodeName' => $nodeName,
        ];
        $this->handlers[] = $entry;
    }

    /**
     * Run the registered handlers on one node, stopping at the first handler
     * that returns anything other than true.
     *
     * @param Node $node
     * @param ?ParsoidExtensionAPI $extAPI
     * @param DTState|null $state
     * @return bool|mixed true when every matching handler returned true,
     *   otherwise the first non-true handler result
     */
    private function callHandlers( Node $node, ?ParsoidExtensionAPI $extAPI, ?DTState $state ) {
        $name = DOMCompat::nodeName( $node );

        // Process embedded HTML first since the handlers below might
        // return a different node which aborts processing. By processing
        // attributes first, we ensure attribute are always processed.
        if ( $node instanceof Element && $this->applyToAttributeEmbeddedHTML ) {
            $traverser = $this;
            $processEmbedded = static function ( string $html ) use ( $traverser, $extAPI, $state ) {
                $dom = $extAPI->htmlToDom( $html );
                // We are processing a nested document (which by definition
                // is not a top-level document).
                // FIXME:
                // 1. This argument replicates existing behavior but is it sound?
                //    In any case, we should first replicate existing behavior
                //    and revisit this later.
                // 2. It is not clear if creating a *new* state is the right thing
                //    or if reusing *parts* of the old state is the right thing.
                //    One of the places where this matters is around the use of
                //    $state->tplInfo. One could probably find arguments for either
                //    direction. But, "independent parsing" semantics which Parsoid
                //    is aiming for would lead us to use a new state or even a new
                //    traversal object here and that feels a little bit "more correct"
                //    than reusing partial state.
                $newState = $state
                    ? new DTState( $state->env, $state->options, false )
                    : null;
                $traverser->traverse( $extAPI, $dom, $newState );
                return $extAPI->domToHtml( $dom, true, true );
            };
            ContentUtils::processAttributeEmbeddedHTML( $extAPI, $node, $processEmbedded );
        }

        foreach ( $this->handlers as $handler ) {
            $filter = $handler['nodeName'];
            if ( $filter !== null && $filter !== $name ) {
                // Handler is restricted to a different node name.
                continue;
            }
            $result = call_user_func( $handler['action'], $node, $state );
            if ( $result !== true ) {
                // Abort processing for this node
                return $result;
            }
        }

        return true;
    }

    /**
     * Traverse the DOM and fire the handlers that are registered.
     *
     * Handlers can return
     * - the next node to process: aborts processing for current node (ie. no further handlers are
     *   called) and continues processing on returned node. Essentially, that node and its siblings
     *   replace the current node and its siblings for the purposes of the traversal; after they
     *   are fully processed, the algorithm moves back to the parent of $workNode to look for
     *   the next sibling.
     * - `null`: same as above, except it continues from the next sibling of the parent (or if
     *   that does not exist, the next sibling of the grandparent etc). This is so that returning
     *   `$workNode->nextSibling` works even when workNode is a last child of its parent.
     * - `true`: continues regular processing on current node.
     *
     * @param ?ParsoidExtensionAPI $extAPI
     * @param Node $workNode The starting node for the traversal.
     *   The traversal could go beyond the subtree rooted at $workNode if
     *   the handlers called during traversal return an arbitrary node elsewhere
     *   in the DOM in which case the traversal scope can be pretty much the whole
     *   DOM that $workNode is present in. This behavior would be confusing but
     *   there is nothing in the traversal code to prevent that.
     * @param DTState|null $state
     */
    public function traverse(
        ?ParsoidExtensionAPI $extAPI, Node $workNode, ?DTState $state = null
    ): void {
        $this->traverseInternal( true, $extAPI, $workNode, $state );
    }

    /**
     * Worker for traverse(): processes $workNode, its subtree, and then
     * whatever node the handlers (or the sibling chain) designate next.
     *
     * @param bool $isRootNode True while $workNode is the node traverse() was called with
     * @param ?ParsoidExtensionAPI $extAPI
     * @param Node $workNode
     * @param DTState|null $state
     */
    private function traverseInternal(
        bool $isRootNode, ?ParsoidExtensionAPI $extAPI, Node $workNode, ?DTState $state
    ): void {
        while ( $workNode !== null ) {
            // Identify the first template/extension node.
            // You'd think the !tplInfo check isn't necessary since
            // we don't have nested transclusions, however, you can
            // get extensions in transclusions.
            if (
                $this->traverseWithTplInfo &&
                $workNode instanceof Element &&
                !( $state->tplInfo ?? null ) &&
                WTUtils::isFirstEncapsulationWrapperNode( $workNode ) &&
                // Ensure this isn't just a meta marker, since we might
                // not be traversing after encapsulation. Note that the
                // valid data-mw assertion is the same test as used in
                // cleanup.
                ( !WTUtils::isTplMarkerMeta( $workNode ) || DOMDataUtils::validDataMw( $workNode ) ) &&
                // Encapsulation info on sections should not be used to
                // traverse with since it's designed to be dropped and
                // may have expanded ranges.
                !WTUtils::isParsoidSectionTag( $workNode )
            ) {
                $about = DOMCompat::getAttribute( $workNode, 'about' );
                $aboutSiblings = WTUtils::getAboutSiblings( $workNode, $about );
                $state->tplInfo = (object)[
                    'first' => $workNode,
                    'last' => end( $aboutSiblings ),
                    'clear' => false,
                ];
            }

            // Call the handlers on this workNode
            // (document fragments themselves are never handed to handlers).
            $possibleNext = $workNode instanceof DocumentFragment
                ? true
                : $this->callHandlers( $workNode, $extAPI, $state );

            // We may have walked passed the last about sibling or want to
            // ignore the template info in future processing.
            // In any case, it's up to the handler returning a possible next
            // to figure out.
            if ( $this->traverseWithTplInfo && ( $state->tplInfo->clear ?? false ) ) {
                $state->tplInfo = null;
            }

            if ( $possibleNext === true ) {
                // The 'continue processing' case
                if ( $workNode->hasChildNodes() ) {
                    $this->traverseInternal(
                        false, $extAPI, $workNode->firstChild, $state
                    );
                }
                // Confine the traverse to the tree rooted as the root node.
                // `$workNode->nextSibling` would take us outside that.
                $possibleNext = $isRootNode ? null : $workNode->nextSibling;
            } elseif ( $isRootNode && $possibleNext !== $workNode ) {
                // A handler moved us off the root node.
                $isRootNode = false;
            }

            // Clear the template info after reaching the last about sibling.
            if (
                $this->traverseWithTplInfo &&
                ( ( $state->tplInfo->last ?? null ) === $workNode )
            ) {
                $state->tplInfo = null;
            }

            $workNode = $possibleNext;
        }
    }
}
PK !
sٟ�D �D DOMCompat.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Assert\Assert; use Wikimedia\Parsoid\DOM\CharacterData; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\Utils\DOMCompat\TokenList; use Wikimedia\Parsoid\Wt2Html\XMLSerializer; use Wikimedia\RemexHtml\DOM\DOMBuilder; use Wikimedia\RemexHtml\HTMLData; use Wikimedia\RemexHtml\Tokenizer\Tokenizer; use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; use Wikimedia\Zest\Zest; /** * Helper class that provides missing DOM level 3 methods for the PHP DOM classes. * For a DOM method $node->foo( $bar) the equivalent helper is DOMCompat::foo( $node, $bar ). * For a DOM property $node->foo there is a DOMCompat::getFoo( $node ) and * DOMCompat::setFoo( $node, $value ). * * Only implements the methods that are actually used by Parsoid. * * Because this class may be used by code outside Parsoid it tries to * be relatively tolerant of object types: you can call it either with * PHP's DOM* types or with a "proper" DOM implementation, and it will * attempt to Do The Right Thing regardless. As a result there are * generally not parameter type hints for DOM object types, and the * return types will be broad enough to accomodate the value a "real" * DOM implementation would return, as well as the values our * thunk will return. (For instance, we can't create a "real" NodeList * in our compatibility thunk.) */ class DOMCompat { /** * Tab, LF, FF, CR, space * @see https://infra.spec.whatwg.org/#ascii-whitespace */ private const ASCII_WHITESPACE = "\t\r\f\n "; /** * Create a new empty document. * This is abstracted because the process is a little different depending * on whether we're using Dodo or DOMDocument, and phan gets a little * confused by this. 
* @param bool $isHtml Not consulted by the current implementation.
	 * @return Document
	 */
	public static function newDocument( bool $isHtml ) {
		// @phan-suppress-next-line PhanParamTooMany,PhanTypeInstantiateInterface
		return new Document( "1.0", "UTF-8" );
	}

	/**
	 * Return the lower-case version of the node name (HTML says this should
	 * be capitalized).
	 * @param Node $node
	 * @return string
	 */
	public static function nodeName( Node $node ): string {
		$name = $node->nodeName;
		return strtolower( $name );
	}

	/**
	 * Get document body.
	 * Unlike the spec we return it as a native PHP DOM object.
	 * The first `body` or `frameset` child of the document element is
	 * returned and remembered on `$document->body`.
	 * @param Document $document
	 * @return Element|null
	 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-body
	 */
	public static function getBody( $document ) {
		// WARNING: this will not be updated if (for some reason) the
		// document body changes.
		if ( $document->body !== null ) {
			return $document->body;
		}
		foreach ( $document->documentElement->childNodes as $child ) {
			/** @var Element $child */
			if ( !in_array( self::nodeName( $child ), [ 'body', 'frameset' ], true ) ) {
				continue;
			}
			// Caching!
			$document->body = $child;
			// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
			return $child;
		}
		return null;
	}

	/**
	 * Get document head.
	 * Unlike the spec we return it as a native PHP DOM object.
	 * The first `head` child of the document element is returned and
	 * remembered on `$document->head`.
	 * @param Document $document
	 * @return Element|null
	 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-head
	 */
	public static function getHead( $document ) {
		// Use an undeclared dynamic property as a cache.
		// WARNING: this will not be updated if (for some reason) the
		// document head changes.
		if ( isset( $document->head ) ) {
			return $document->head;
		}
		foreach ( $document->documentElement->childNodes as $child ) {
			/** @var Element $child */
			if ( self::nodeName( $child ) !== 'head' ) {
				continue;
			}
			// Caching!
			$document->head = $child;
			// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
			return $child;
		}
		return null;
	}

	/**
	 * Get document title.
* @param Document $document
	 * @return string The text of the first `title` element with ASCII
	 *   whitespace stripped and collapsed, or '' if there is none.
	 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
	 */
	public static function getTitle( $document ): string {
		$titleElement = self::querySelector( $document, 'title' );
		if ( !$titleElement ) {
			return '';
		}
		return self::stripAndCollapseASCIIWhitespace( $titleElement->textContent );
	}

	/**
	 * Set document title.
	 * A `title` element is appended to the head when none exists yet; if
	 * the document has no head either, nothing is changed.
	 * @param Document $document
	 * @param string $title
	 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
	 */
	public static function setTitle( $document, string $title ): void {
		$titleElement = self::querySelector( $document, 'title' );
		if ( !$titleElement && self::getHead( $document ) ) {
			$titleElement = DOMUtils::appendToHead( $document, 'title' );
		}
		if ( $titleElement ) {
			$titleElement->textContent = $title;
		}
	}

	/**
	 * Return the parent element, or null if the parent is not an element.
	 * @param Node $node
	 * @return Element|null
	 * @see https://dom.spec.whatwg.org/#dom-node-parentelement
	 */
	public static function getParentElement( $node ) {
		$parent = $node->parentNode;
		if ( !$parent || $parent->nodeType !== XML_ELEMENT_NODE ) {
			return null;
		}
		/** @var Element $parent */
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return $parent;
	}

	/**
	 * Return the descendant with the specified ID.
	 * Workaround for https://bugs.php.net/bug.php?id=77686 and other issues related to
	 * inconsistent indexing behavior.
	 * XXX: 77686 is fixed in php 8.1.21
	 * @param Document|DocumentFragment $node
	 * @param string $id
	 * @return Element|null
	 * @see https://dom.spec.whatwg.org/#dom-nonelementparentnode-getelementbyid
	 */
	public static function getElementById( $node, string $id ) {
		$allowedTypes = self::or(
			Document::class, DocumentFragment::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMDocument::class, \DOMDocumentFragment::class
		);
		Assert::parameterType( $allowedTypes, $node, '$node' );
		// @phan-suppress-next-line PhanTypeMismatchArgument Zest is declared to take DOMDocument\DOMElement
		$matches = Zest::getElementsById( $node, $id );
		// @phan-suppress-next-line PhanTypeMismatchReturn
		return $matches[0] ?? null;
	}

	/**
	 * Workaround bug in PHP's Document::getElementById() which doesn't
	 * actually index the 'id' attribute unless you use the non-standard
	 * `Element::setIdAttribute` method after the attribute is set;
	 * see https://www.php.net/manual/en/domdocument.getelementbyid.php
	 * for more details.
	 *
	 * @param Element $element
	 * @param string $id The desired value for the `id` attribute on $element.
	 * @see https://phabricator.wikimedia.org/T232390
	 */
	public static function setIdAttribute( $element, string $id ): void {
		$element->setAttribute( 'id', $id );
		// phab:T232390
		$element->setIdAttribute( 'id', true );
	}

	/**
	 * Return all descendants with the specified tag name.
	 * Workaround for PHP's getElementsByTagName being inexplicably slow in some situations
	 * and the lack of Element::getElementsByTagName().
	 * @param Document|Element $node
	 * @param string $tagName
	 * @return (iterable<Element>&\Countable)|array<Element> Either an array or an HTMLCollection object
	 * @see https://dom.spec.whatwg.org/#dom-document-getelementsbytagname
	 * @see https://dom.spec.whatwg.org/#dom-element-getelementsbytagname
	 * @note Note that unlike the spec this method is not guaranteed to return a NodeList
	 *  (which cannot be freely constructed in PHP), just a traversable containing Elements.
	 */
	public static function getElementsByTagName( $node, string $tagName ): iterable {
		$allowedTypes = self::or(
			Document::class, Element::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMDocument::class, \DOMElement::class
		);
		Assert::parameterType( $allowedTypes, $node, '$node' );
		// @phan-suppress-next-line PhanTypeMismatchArgument Zest is declared to take DOMDocument\DOMElement
		$result = Zest::getElementsByTagName( $node, $tagName );
		'@phan-var array<Element> $result'; // @var array<Element> $result
		return $result;
	}

	/**
	 * Return the last child of the node that is an Element, or null otherwise.
	 * @param Document|DocumentFragment|Element $node
	 * @return Element|null
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-lastelementchild
	 */
	public static function getLastElementChild( $node ) {
		$allowedTypes = self::or(
			Document::class, DocumentFragment::class, Element::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMDocument::class, \DOMDocumentFragment::class, \DOMElement::class
		);
		Assert::parameterType( $allowedTypes, $node, '$node' );
		// Walk backwards from the last child until an element turns up
		// or the children are exhausted.
		for ( $child = $node->lastChild; $child; $child = $child->previousSibling ) {
			if ( $child->nodeType === XML_ELEMENT_NODE ) {
				break;
			}
		}
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return $child;
	}

	/**
	 * Return the first element matching the selector, or null if none does.
	 * @param Document|DocumentFragment|Element $node
	 * @param string $selector
	 * @return Element|null
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselector
	 */
	public static function querySelector( $node, string $selector ) {
		foreach ( self::querySelectorAll( $node, $selector ) as $firstMatch ) {
			return $firstMatch;
		}
		return null;
	}

	/**
	 * @param Document|DocumentFragment|Element $node
	 * @param string $selector
	 * @return (iterable<Element>&\Countable)|array<Element> Either a NodeList or an array
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselectorall
	 * @note Note that unlike the spec this method is not guaranteed to return a NodeList
	 *  (which cannot be freely constructed in PHP), just a traversable containing Elements.
*/
	public static function querySelectorAll( $node, string $selector ): iterable {
		$allowedTypes = self::or(
			Document::class, DocumentFragment::class, Element::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMDocument::class, \DOMDocumentFragment::class, \DOMElement::class
		);
		Assert::parameterType( $allowedTypes, $node, '$node' );
		// @phan-suppress-next-line PhanTypeMismatchArgument DOMNode
		return Zest::find( $selector, $node );
	}

	/**
	 * Return the last preceding sibling of the node that is an element, or null otherwise.
	 * @param Node $node
	 * @return Element|null
	 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-previouselementsibling
	 */
	public static function getPreviousElementSibling( $node ) {
		$allowedTypes = self::or(
			Element::class, CharacterData::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMElement::class, \DOMCharacterData::class
		);
		Assert::parameterType( $allowedTypes, $node, '$node' );
		// Step backwards until an element is found or siblings run out.
		for ( $sibling = $node->previousSibling; $sibling; $sibling = $sibling->previousSibling ) {
			if ( $sibling->nodeType === XML_ELEMENT_NODE ) {
				break;
			}
		}
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return $sibling;
	}

	/**
	 * Return the first following sibling of the node that is an element, or null otherwise.
	 * @param Node $node
	 * @return Element|null
	 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-nextelementsibling
	 */
	public static function getNextElementSibling( $node ) {
		$allowedTypes = self::or(
			Element::class, CharacterData::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMElement::class, \DOMCharacterData::class
		);
		Assert::parameterType( $allowedTypes, $node, '$node' );
		// Step forwards until an element is found or siblings run out.
		for ( $sibling = $node->nextSibling; $sibling; $sibling = $sibling->nextSibling ) {
			if ( $sibling->nodeType === XML_ELEMENT_NODE ) {
				break;
			}
		}
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return $sibling;
	}

	/**
	 * Removes the node from the document.
* @param Element|CharacterData $node
	 * @see https://dom.spec.whatwg.org/#dom-childnode-remove
	 */
	public static function remove( $node ): void {
		$allowedTypes = self::or(
			Element::class, CharacterData::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMElement::class, \DOMCharacterData::class
		);
		Assert::parameterType( $allowedTypes, $node, '$node' );
		// A node without a parent is already detached: nothing to do.
		if ( !$node->parentNode ) {
			return;
		}
		$node->parentNode->removeChild( $node );
	}

	/**
	 * Get innerHTML.
	 * @see DOMUtils::getFragmentInnerHTML() for the fragment version
	 * @param Element $element
	 * @return string
	 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
	 */
	public static function getInnerHTML( $element ): string {
		$serialized = XMLSerializer::serialize( $element, [ 'innerXML' => true ] );
		return $serialized['html'];
	}

	/**
	 * Set innerHTML.
	 * The HTML is parsed as a fragment in the context of $element, the
	 * element's existing children are removed, and the parsed nodes are
	 * moved into it.
	 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
	 * @see DOMUtils::setFragmentInnerHTML() for the fragment version
	 * @param Element $element
	 * @param string $html
	 */
	public static function setInnerHTML( $element, string $html ): void {
		// DOMBuilder subclass that creates its document through
		// DOMCompat::newDocument().
		$builder = new class( [
			'suppressHtmlNamespace' => true,
		] ) extends DOMBuilder {
			/** @inheritDoc */
			protected function createDocument(
				?string $doctypeName = null,
				?string $public = null,
				?string $system = null
			) {
				// @phan-suppress-next-line PhanTypeMismatchReturn
				return DOMCompat::newDocument( $doctypeName === 'html' );
			}
		};
		$tokenizer = new Tokenizer(
			new Dispatcher( new TreeBuilder( $builder ) ),
			$html,
			[ 'ignoreErrors' => true ]
		);
		$tokenizer->execute( [
			'fragmentNamespace' => HTMLData::NS_HTML,
			'fragmentName' => self::nodeName( $element ),
		] );

		// Empty the element
		self::replaceChildren( $element );

		$frag = $builder->getFragment();
		'@phan-var Node $frag'; // @var Node $frag
		DOMUtils::migrateChildrenBetweenDocs( $frag, $element );
	}

	/**
	 * Get outerHTML.
* @param Element $element
	 * @return string
	 * @see https://w3c.github.io/DOM-Parsing/#dom-element-outerhtml
	 */
	public static function getOuterHTML( $element ): string {
		$serialized = XMLSerializer::serialize( $element, [ 'addDoctype' => false ] );
		return $serialized['html'];
	}

	/**
	 * Return the value of an element attribute.
	 *
	 * Unlike PHP's version, this is spec-compliant and returns `null` if
	 * the attribute is not present, allowing the caller to distinguish
	 * between "the attribute exists but has the empty string as its value"
	 * and "the attribute does not exist".
	 *
	 * @param Element $element
	 * @param string $attributeName
	 * @return ?string The attribute value, or `null` if the attribute does
	 *   not exist on the element.
	 * @see https://dom.spec.whatwg.org/#dom-element-getattribute
	 */
	public static function getAttribute( $element, string $attributeName ): ?string {
		return $element->hasAttribute( $attributeName )
			? $element->getAttribute( $attributeName )
			: null;
	}

	/**
	 * Return the class list of this element.
* @param Element $node
	 * @return TokenList
	 * @see https://dom.spec.whatwg.org/#dom-element-classlist
	 */
	public static function getClassList( $node ): TokenList {
		return new TokenList( $node );
	}

	/**
	 * Trim ASCII whitespace from both ends of $text and replace every
	 * remaining run of it with a single space.
	 * @param string $text
	 * @return string
	 * @see https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace
	 */
	private static function stripAndCollapseASCIIWhitespace( string $text ): string {
		$ws = self::ASCII_WHITESPACE;
		return preg_replace( "/[$ws]+/", ' ', trim( $text, $ws ) );
	}

	/**
	 * Recursively remove text nodes whose value is the empty string.
	 * @param Element|DocumentFragment $e
	 */
	private static function stripEmptyTextNodes( $e ): void {
		// The next sibling is captured up front because the current child
		// may be removed from the tree.
		for ( $c = $e->firstChild; $c; $c = $next ) {
			$next = $c->nextSibling;
			if ( $c instanceof Text ) {
				if ( $c->nodeValue === '' ) {
					$e->removeChild( $c );
				}
			} elseif ( $c instanceof Element ) {
				self::stripEmptyTextNodes( $c );
			}
		}
	}

	/**
	 * @param Element|DocumentFragment $elt root of the DOM tree that
	 *   needs to be normalized
	 */
	public static function normalize( $elt ): void {
		$elt->normalize();

		// Now traverse the tree rooted at $elt and remove any stray empty text nodes
		// Unlike what https://www.w3.org/TR/DOM-Level-2-Core/core.html#ID-normalize says,
		// the PHP DOM's normalization leaves behind up to 1 empty text node.
		// See https://bugs.php.net/bug.php?id=78221
		self::stripEmptyTextNodes( $elt );
	}

	/**
	 * ParentNode.replaceChildren()
	 * https://developer.mozilla.org/en-US/docs/Web/API/ParentNode/replaceChildren
	 *
	 * Removes every existing child of $parentNode, then appends $nodes in
	 * order. String arguments are converted to text nodes first.
	 *
	 * @param Document|DocumentFragment|Element $parentNode
	 * @param string|Node ...$nodes
	 */
	public static function replaceChildren( $parentNode, ...$nodes ): void {
		Assert::parameterType(
			self::or(
				Document::class, DocumentFragment::class, Element::class,
				// For compatibility with code which might call this from
				// outside Parsoid.
				\DOMDocument::class, \DOMDocumentFragment::class, \DOMElement::class
			),
			$parentNode, '$parentNode'
		);
		while ( $parentNode->firstChild ) {
			$parentNode->removeChild( $parentNode->firstChild );
		}
		// A Document has a null ownerDocument, so when the parent is the
		// document itself it must create the text nodes directly.
		$doc = $parentNode->ownerDocument ?? $parentNode;
		foreach ( $nodes as $node ) {
			if ( is_string( $node ) ) {
				$node = $doc->createTextNode( $node );
			}
			$parentNode->insertBefore( $node, null );
		}
	}

	/**
	 * Join class names together in a form suitable for Assert::parameterType.
	 * @param class-string ...$args
	 * @return string
	 */
	private static function or( ...$args ) {
		return implode( '|', $args );
	}
}
PK ! �gG��X �X TokenUtils.phpnu �Iw�� <?php
declare( strict_types = 1 );

/**
 * This file contains general utilities for:
 * (a) querying token properties and token types
 * (b) manipulating tokens, individually and as collections.
 */

namespace Wikimedia\Parsoid\Utils;

use Wikimedia\Assert\Assert;
use Wikimedia\Assert\UnreachableException;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\Tokens\CommentTk;
use Wikimedia\Parsoid\Tokens\EndTagTk;
use Wikimedia\Parsoid\Tokens\EOFTk;
use Wikimedia\Parsoid\Tokens\KV;
use Wikimedia\Parsoid\Tokens\KVSourceRange;
use Wikimedia\Parsoid\Tokens\NlTk;
use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
use Wikimedia\Parsoid\Tokens\SourceRange;
use Wikimedia\Parsoid\Tokens\TagTk;
use Wikimedia\Parsoid\Tokens\Token;
use Wikimedia\Parsoid\Wikitext\Consts;

class TokenUtils {
	public const SOL_TRANSPARENT_LINK_REGEX =
		'/(?:^|\s)mw:PageProp\/(?:Category|redirect|Language)(?=$|\s)/D';

	/**
	 * Gets a string type value for a token
	 * @param Token|string $token
	 * @return string 'string' for plain strings, otherwise the token's
	 *   own getType() value.
	 */
	public static function getTokenType( $token ): string {
		if ( is_string( $token ) ) {
			return 'string';
		}
		return $token->getType();
	}

	/**
	 * @param string $name
	 * @return bool
	 */
	public static function isWikitextBlockTag( string $name ): bool {
		return isset( Consts::$wikitextBlockElems[$name] );
	}

	/**
	 * In the legacy parser, these block tags open block-tag scope
	 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
	 *
	 * @param string $name
	 * @return bool
	 */
	public static function tagOpensBlockScope( string $name ): bool {
		if ( isset( Consts::$blockElems[$name] ) ) {
			return true;
		}
		return isset( Consts::$alwaysBlockElems[$name] );
	}

	/**
	 * In the legacy parser, these block tags close block-tag scope
	 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
	 *
	 * @param string $name
	 * @return bool
	 */
	public static function tagClosesBlockScope( string $name ): bool {
		if ( isset( Consts::$antiBlockElems[$name] ) ) {
			return true;
		}
		return isset( Consts::$neverBlockElems[$name] );
	}

	/**
	 * Is this a template token?
	 * @param Token|string|null $token
	 * @return bool
	 */
	public static function isTemplateToken( $token ): bool {
		return $token instanceof SelfclosingTagTk && $token->getName() === 'template';
	}

	/**
	 * Determine whether the current token was an HTML tag in wikitext.
	 *
	 * @param Token|string|null $token
	 * @return bool True for a start, end or self-closing tag token whose
	 *   data-parsoid `stx` is 'html'.
	 */
	public static function isHTMLTag( $token ): bool {
		// instanceof is false for null and strings, so no separate
		// null/string guard is needed.
		$isTag = $token instanceof TagTk ||
			$token instanceof EndTagTk ||
			$token instanceof SelfclosingTagTk;
		return $isTag && ( $token->dataParsoid->stx ?? null ) === 'html';
	}

	/**
	 * Is the token a DOMFragment type value?
	 *
	 * @param Token $token
	 * @return bool
	 */
	public static function hasDOMFragmentType( Token $token ): bool {
		return self::matchTypeOf( $token, '#^mw:DOMFragment(/sealed/\w+)?$#D' ) !== null;
	}

	/**
	 * Is the token a table tag?
*
	 * @param Token|string $token
	 * @return bool True for a start or end tag token whose name is listed
	 *   in Consts::$HTML['TableTags'].
	 */
	public static function isTableTag( $token ): bool {
		if ( !( $token instanceof TagTk || $token instanceof EndTagTk ) ) {
			return false;
		}
		return isset( Consts::$HTML['TableTags'][$token->getName()] );
	}

	/**
	 * Determine if token is a transparent link tag
	 *
	 * @param Token|string $token
	 * @return bool True for a `link` tag token whose `rel` attribute
	 *   matches SOL_TRANSPARENT_LINK_REGEX.
	 */
	public static function isSolTransparentLinkTag( $token ): bool {
		$isTagToken = $token instanceof SelfclosingTagTk ||
			$token instanceof TagTk ||
			$token instanceof EndTagTk;
		if ( !$isTagToken || $token->getName() !== 'link' ) {
			return false;
		}
		$rel = $token->getAttributeV( 'rel' ) ?? '';
		return (bool)preg_match( self::SOL_TRANSPARENT_LINK_REGEX, $rel );
	}

	/**
	 * Does this token represent a behavior switch?
	 *
	 * @param Env $env
	 * @param Token|string $token
	 * @return bool
	 */
	public static function isBehaviorSwitch( Env $env, $token ): bool {
		if ( !( $token instanceof SelfclosingTagTk ) ) {
			return false;
		}
		// Before BehaviorSwitchHandler (ie. PreHandler, etc.)
		if ( $token->getName() === 'behavior-switch' ) {
			return true;
		}
		// After BehaviorSwitchHandler
		// (ie. ListHandler, ParagraphWrapper, etc.)
		if ( $token->getName() !== 'meta' || !$token->hasAttribute( 'property' ) ) {
			return false;
		}
		return (bool)preg_match(
			$env->getSiteConfig()->bswPagePropRegexp(),
			$token->getAttributeV( 'property' ) ?? ''
		);
	}

	/**
	 * This should come close to matching
	 * {@link WTUtils::emitsSolTransparentSingleLineWT},
	 * without the single line caveat.
* @param Env $env
	 * @param Token|string $token
	 * @return bool
	 */
	public static function isSolTransparent( Env $env, $token ): bool {
		if ( is_string( $token ) ) {
			// Only strings made up of spaces and tabs qualify.
			return (bool)preg_match( '/^[ \t]*$/D', $token );
		}
		if ( self::isSolTransparentLinkTag( $token ) ) {
			return true;
		}
		if ( $token instanceof CommentTk && !self::isTranslationUnitMarker( $env, $token ) ) {
			return true;
		}
		if ( self::isBehaviorSwitch( $env, $token ) ) {
			return true;
		}
		if ( !$token instanceof SelfclosingTagTk || $token->getName() !== 'meta' ) {
			return false;
		}
		// only metas left: transparent unless written with HTML syntax
		return ( $token->dataParsoid->stx ?? null ) !== 'html';
	}

	/**
	 * HACK: Returns true if $token looks like a TU marker (<!--T:XXX-->) and if we could be in a
	 * translate-annotated page.
	 * @param Env $env
	 * @param CommentTk $token
	 * @return bool
	 */
	public static function isTranslationUnitMarker( Env $env, CommentTk $token ): bool {
		if ( !$env->hasAnnotations ) {
			return false;
		}
		if ( !$env->getSiteConfig()->isAnnotationTag( 'translate' ) ) {
			return false;
		}
		return preg_match( '/^T:/', $token->value ) === 1;
	}

	/**
	 * Is the token an empty-line meta marker, i.e. a self-closing `meta`
	 * token whose `typeof` attribute is exactly `mw:EmptyLine`?
	 *
	 * @param Token|string $token
	 * @return bool
	 */
	public static function isEmptyLineMetaToken( $token ): bool {
		if ( !( $token instanceof SelfclosingTagTk ) ) {
			return false;
		}
		return $token->getName() === 'meta' &&
			$token->getAttributeV( 'typeof' ) === 'mw:EmptyLine';
	}

	/**
	 * Determine whether the token matches the given `typeof` attribute value.
	 *
	 * @param Token $t The token to test
	 * @param string $typeRe Regular expression matching the expected value of
	 *   the `typeof` attribute.
	 * @return ?string The matching `typeof` value, or `null` if there is
	 *   no match.
*/
	public static function matchTypeOf( Token $t, string $typeRe ): ?string {
		$typeofAttr = $t->getAttributeV( 'typeof' );
		if ( $typeofAttr === null ) {
			return null;
		}
		Assert::invariant( is_string( $typeofAttr ), "Typeof is not simple" );
		// `typeof` is a whitespace-separated list; test each entry on its own.
		$types = preg_split( '/\s+/', $typeofAttr, -1, PREG_SPLIT_NO_EMPTY );
		foreach ( $types as $type ) {
			$matched = preg_match( $typeRe, $type );
			Assert::invariant( $matched !== false, "Bad regexp" );
			if ( $matched ) {
				return $type;
			}
		}
		return null;
	}

	/**
	 * Determine whether the token matches the given typeof attribute value.
	 *
	 * @param Token $t
	 * @param string $type Expected value of "typeof" attribute, as a literal
	 *   string.
	 * @return bool True if the token matches.
	 */
	public static function hasTypeOf( Token $t, string $type ): bool {
		$exactRe = '/^' . preg_quote( $type, '/' ) . '$/D';
		return self::matchTypeOf( $t, $exactRe ) !== null;
	}

	/**
	 * Shift TSR of a token
	 *
	 * PORT-FIXME: In JS this was sometimes called with $offset=undefined, which meant do
	 * nothing by default, except if there was a third parameter set to true, in which case it
	 * meant the same thing as $offset = null. We can't pass in undefined in PHP, so this should
	 * usually be handled with isset() is the caller. But isset() returns true if the variable is
	 * null, so let's use false instead of null for whatever the previous code meant by a null
	 * offset.
	 *
	 * @param array<Token|string> $tokens
	 * @param int|false|null $offset Amount to shift by. `false` (or `null`,
	 *   which is treated the same) clears the offsets instead of shifting
	 *   them; `0` is a no-op.
	 */
	public static function shiftTokenTSR( array $tokens, $offset ): void {
		// Bail early if we can
		if ( $offset === 0 ) {
			return;
		}

		// JS b/c
		if ( $offset === null ) {
			$offset = false;
		}

		// Only tokens of exactly these classes carry offsets to fix up.
		$shiftable = [
			TagTk::class,
			SelfclosingTagTk::class,
			NlTk::class,
			CommentTk::class,
			EndTagTk::class,
		];

		// update/clear tsr
		foreach ( $tokens as $t ) {
			if ( !is_object( $t ) || !in_array( get_class( $t ), $shiftable, true ) ) {
				continue;
			}

			$da = $t->dataParsoid;
			if ( $da->tsr ) {
				$da->tsr = $offset ? $da->tsr->offset( $offset ) : null;
			}

			if ( $offset && isset( $da->extTagOffsets ) ) {
				$da->extTagOffsets = $da->extTagOffsets->offset( $offset );
			}

			// SSS FIXME: offset will always be available in
			// chunky-tokenizer mode in which case we wont have
			// buggy offsets below. The null scenario is only
			// for when the token-stream-patcher attempts to
			// reparse a string -- it is likely to only patch up
			// small string fragments and the complicated use cases
			// below should not materialize.
			// CSA: token-stream-patcher shouldn't have problems
			// now that $frame->srcText is always accurate?

			// content offsets for ext-links
			if ( $offset && isset( $da->tmp->extLinkContentOffsets ) ) {
				$da->tmp->extLinkContentOffsets =
					$da->tmp->extLinkContentOffsets->offset( $offset );
			}

			// Process attributes
			foreach ( $t->attribs ?? [] as $kv ) {
				if ( is_array( $kv->k ) ) {
					self::shiftTokenTSR( $kv->k, $offset );
				}
				if ( is_array( $kv->v ) ) {
					self::shiftTokenTSR( $kv->v, $offset );
				}

				// src offsets used to set mw:TemplateParams
				if ( !$offset ) {
					$kv->srcOffsets = null;
				} elseif ( $kv->srcOffsets !== null ) {
					$kv->srcOffsets = $kv->srcOffsets->offset( $offset );
				}
			}
		}
	}

	/**
	 * Strip EOFTk token from token chunk.
	 * The EOFTk is expected to be the last token of the chunk.
	 *
	 * @param array &$tokens
	 * @return array return the modified token array so that this call can be chained
	 */
	public static function stripEOFTkFromTokens( array &$tokens ): array {
		$lastIndex = count( $tokens ) - 1;
		if ( $lastIndex >= 0 && $tokens[$lastIndex] instanceof EOFTk ) {
			array_pop( $tokens );
		}
		return $tokens;
	}

	/**
	 * Convert string offsets
	 *
	 * Offset types are:
	 *  - 'byte': Bytes (UTF-8 encoding), e.g.
PHP `substr()` or `strlen()`.
	 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
	 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
	 *
	 * Offsets that are mid-Unicode character are "rounded" up to the next full
	 * character, i.e. the output offset will always point to the start of a
	 * Unicode code point (or just past the end of the string). Offsets outside
	 * the string are "rounded" to 0 or just-past-the-end.
	 *
	 * @note When constructing the array of offsets to pass to this method,
	 *  populate it with references as `$offsets[] = &$var;`.
	 *
	 * @param string $s Unicode string the offsets are offsets into, UTF-8 encoded.
	 * @param string $from Offset type to convert from.
	 * @param string $to Offset type to convert to.
	 * @param int[] $offsets References to the offsets to convert.
	 */
	public static function convertOffsets(
		string $s, string $from, string $to, array $offsets
	): void {
		static $valid = [ 'byte', 'char', 'ucs2' ];
		if ( !in_array( $from, $valid, true ) ) {
			throw new \InvalidArgumentException( 'Invalid $from' );
		}
		if ( !in_array( $to, $valid, true ) ) {
			throw new \InvalidArgumentException( 'Invalid $to' );
		}

		$i = 0;
		$offsetCt = count( $offsets );
		if ( $offsetCt === 0 ) {
			// Nothing to do
			return;
		}
		sort( $offsets, SORT_NUMERIC );

		// One running position per offset type. $fromPos/$toPos alias the
		// two that matter for this call, so the counter names below must
		// stay `<type>Pos`.
		$bytePos = 0;
		$ucs2Pos = 0;
		$charPos = 0;
		$fromPos = &${$from . 'Pos'}; // @phan-suppress-current-line PhanPluginDollarDollar
		$toPos = &${$to . 'Pos'}; // @phan-suppress-current-line PhanPluginDollarDollar

		$byteLen = strlen( $s );
		while ( $bytePos < $byteLen ) {
			// Update offsets that we've reached
			while ( $offsets[$i] <= $fromPos ) {
				$offsets[$i] = $toPos;
				if ( ++$i >= $offsetCt ) {
					return;
				}
			}

			// Update positions. The top five bits of the lead byte give
			// the length of the UTF-8 sequence.
			++$charPos;
			$lead = ord( $s[$bytePos] ) & 0xf8;
			if ( $lead < 0x80 ) {
				// 0xxxxxxx: single byte
				$bytePos += 1;
				$ucs2Pos += 1;
			} elseif ( $lead >= 0xc0 && $lead < 0xe0 ) {
				// 110xxxxx: two bytes
				$bytePos += 2;
				$ucs2Pos += 1;
			} elseif ( $lead === 0xe0 || $lead === 0xe8 ) {
				// 1110xxxx: three bytes
				$bytePos += 3;
				$ucs2Pos += 1;
			} elseif ( $lead === 0xf0 ) {
				// 11110xxx: four bytes, a surrogate pair in UTF-16
				$bytePos += 4;
				$ucs2Pos += 2;
			} else {
				throw new \InvalidArgumentException( '$s is not UTF-8' );
			}
		}

		// Convert any offsets past the end of the string to the length of the
		// string.
		for ( ; $i < $offsetCt; ++$i ) {
			$offsets[$i] = $toPos;
		}
	}

	/**
	 * Convert offsets in a token array
	 *
	 * @see TokenUtils::convertOffsets()
	 *
	 * @param string $s The offset reference string
	 * @param string $from Offset type to convert from
	 * @param string $to Offset type to convert to
	 * @param array<Token|string|array> $tokens
	 */
	public static function convertTokenOffsets(
		string $s, string $from, string $to, array $tokens
	): void {
		$offsets = []; /* @var array<int> $offsets */

		// Pass 1: gather a reference to every offset found in the tokens.
		$gather = static function ( $sr ) use ( &$offsets ) {
			if ( $sr instanceof DomSourceRange ) {
				// Adjust the widths to be actual character offsets
				if ( $sr->openWidth !== null ) {
					Assert::invariant( $sr->start !== null, "width w/o start" );
					$sr->openWidth = $sr->start + $sr->openWidth;
					$offsets[] =& $sr->openWidth;
				}
				if ( $sr->closeWidth !== null ) {
					Assert::invariant( $sr->end !== null, "width w/o end" );
					$sr->closeWidth = $sr->end - $sr->closeWidth;
					$offsets[] =& $sr->closeWidth;
				}
			}
			if ( $sr->start !== null ) {
				$offsets[] =& $sr->start;
			}
			if ( $sr->end !== null ) {
				$offsets[] =& $sr->end;
			}
		};
		self::collectOffsets( $tokens, $gather );

		self::convertOffsets( $s, $from, $to, $offsets );

		// Pass 2: turn the DomSourceRange widths back into widths.
		$restoreWidths = static function ( $sr ) {
			if ( !( $sr instanceof DomSourceRange ) ) {
				return;
			}
			// Adjust widths back from being character offsets
			if ( $sr->openWidth !== null ) {
				$sr->openWidth -= $sr->start;
			}
			if ( $sr->closeWidth !== null ) {
				$sr->closeWidth = $sr->end - $sr->closeWidth;
			}
		};
		self::collectOffsets( $tokens, $restoreWidths );
	}

	/**
	 * Recursively visit $input and invoke $offsetFunc on every source range
	 * found in it.
	 *
	 * @param array<Token|string>|array<KV>|KV|Token|DomSourceRange|KVSourceRange|SourceRange|string $input
	 * @param callable $offsetFunc
	 */
	private static function collectOffsets( $input, callable $offsetFunc ): void {
		if ( is_array( $input ) ) {
			foreach ( $input as $item ) {
				self::collectOffsets( $item, $offsetFunc );
			}
			return;
		}

		if ( $input instanceof KV ) {
			self::collectOffsets( $input->k, $offsetFunc );
			self::collectOffsets( $input->v, $offsetFunc );
			if ( $input->srcOffsets ) {
				self::collectOffsets( $input->srcOffsets, $offsetFunc );
			}
			return;
		}

		if ( $input instanceof Token ) {
			$dp = $input->dataParsoid;
			if ( isset( $dp->tsr ) ) {
				self::collectOffsets( $dp->tsr, $offsetFunc );
			}
			if ( isset( $dp->tmp->extLinkContentOffsets ) ) {
				self::collectOffsets( $dp->tmp->extLinkContentOffsets, $offsetFunc );
			}
			if ( isset( $dp->tokens ) ) {
				self::collectOffsets( $dp->tokens, $offsetFunc );
			}
			if ( isset( $dp->extTagOffsets ) ) {
				self::collectOffsets( $dp->extTagOffsets, $offsetFunc );
			}
			self::collectOffsets( $input->attribs, $offsetFunc );
			return;
		}

		if ( $input instanceof KVSourceRange ) {
			self::collectOffsets( $input->key, $offsetFunc );
			self::collectOffsets( $input->value, $offsetFunc );
			return;
		}

		if ( $input instanceof SourceRange ) {
			// This includes DomSourceRange
			$offsetFunc( $input );
		}
	}

	/**
	 * Tests whether token represents an HTML entity.
	 * Think `<span typeof="mw:Entity">`.
* @param Token|string|null $token * @return bool */ public static function isEntitySpanToken( $token ): bool { return $token && $token instanceof TagTk && $token->getName() === 'span' && self::hasTypeOf( $token, 'mw:Entity' ); } /** * Transform `"\n"` and `"\r\n"` in the input string to {@link NlTk} tokens. * @param string $str * @return array (interspersed string and NlTk tokens) */ public static function newlinesToNlTks( string $str ): array { $toks = preg_split( '/\n|\r\n/', $str ); $ret = []; // Add one NlTk between each pair, hence toks.length-1 for ( $i = 0, $n = count( $toks ) - 1; $i < $n; $i++ ) { $ret[] = $toks[$i]; $ret[] = new NlTk( null ); } $ret[] = $toks[$i]; return $ret; } /** * Flatten/convert a token array into a string. * @param string|Token|array<Token|string> $tokens * @param bool $strict Whether to abort as soon as we find a token we * can't stringify. * @param array<string,bool|Env> $opts * @return string|array{0:string,1:Array<Token|string>} * The stringified tokens. If $strict is true, returns a two-element * array containing string prefix and the remainder of the tokens as * soon as we encounter something we can't stringify. * * Unsure why phan is whining about $opts array accesses. * So for now, I am simply suppressing those warnings. */ public static function tokensToString( $tokens, bool $strict = false, array $opts = [] ) { if ( is_string( $tokens ) ) { return $tokens; } if ( !is_array( $tokens ) ) { $tokens = [ $tokens ]; } $out = ''; for ( $i = 0, $l = count( $tokens ); $i < $l; $i++ ) { $token = $tokens[$i]; if ( $token === null ) { throw new UnreachableException( "No nulls expected." ); } elseif ( $token instanceof KV ) { // Since this function is occasionally called on KV->v, // whose signature recursively includes KV[], a mismatch with // this function, we assert that those values are only // included in safe places that don't intend to stringify // their tokens. throw new UnreachableException( "No KVs expected." 
); } elseif ( is_string( $token ) ) { $out .= $token; } elseif ( is_array( $token ) ) { Assert::invariant( !$strict, "strict case handled above" ); $out .= self::tokensToString( $token, $strict, $opts ); } elseif ( $token instanceof CommentTk || ( empty( $opts['retainNLs'] ) && $token instanceof NlTk ) ) { // strip comments and newlines } elseif ( !empty( $opts['stripEmptyLineMeta'] ) && self::isEmptyLineMetaToken( $token ) ) { // If requested, strip empty line meta tokens too. } elseif ( !empty( $opts['includeEntities'] ) && self::isEntitySpanToken( $token ) ) { $out .= $token->dataParsoid->src; $i += 2; // Skip child and end tag. } elseif ( $strict ) { // If strict, return accumulated string on encountering first non-text token return [ $out, array_slice( $tokens, $i ) ]; } elseif ( // This option shouldn't be used if the tokens have been // expanded to DOM !empty( $opts['unpackDOMFragments'] ) && ( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) && self::hasDOMFragmentType( $token ) ) { // Handle dom fragments $domFragment = $opts['env']->getDOMFragment( $token->dataParsoid->html ); // Calling `env->removeDOMFragment()` here is case dependent // but should be rare enough when permissible that it can be // ignored. // FIXME: The correct thing to do would be to return // `$domFragment.innerHTML` for the current scenarios where // `unpackDOMFragments` is used (expanded attribute // values and reparses thereof) but we'd need to remove // the span wrapping and typeof annotation of extension // content and nowikis. Since we're primarily expecting // to find <translate> and <nowiki> here, this will do. $out .= $domFragment->textContent; if ( $token instanceof TagTk ) { $i += 1; // Skip the EndTagTK Assert::invariant( $i >= $l || $tokens[$i] instanceof EndTagTk, "tag should be followed by endtag" ); } } } return $out; } /** * Convert an array of key-value pairs into a hash of keys to values. * For duplicate keys, the last entry wins. 
* @param array<KV> $kvs * @return array<string,array<Token|string>>|array<string,string> */ public static function kvToHash( array $kvs ): array { $res = []; foreach ( $kvs as $kv ) { $key = trim( self::tokensToString( $kv->k ) ); // SSS FIXME: Temporary fix to handle extensions which use // entities in attribute values. We need more robust handling // of non-string template attribute values in general. $val = self::tokensToString( $kv->v ); $res[mb_strtolower( $key )] = self::tokenTrim( $val ); } return $res; } /** * Trim space and newlines from leading and trailing text tokens. * @param string|Token|(Token|string)[] $tokens * @return string|Token|(Token|string)[] */ public static function tokenTrim( $tokens ) { if ( !is_array( $tokens ) ) { if ( is_string( $tokens ) ) { return trim( $tokens ); } return $tokens; } $n = count( $tokens ); // strip leading space foreach ( $tokens as &$token ) { if ( $token instanceof NlTk ) { $token = ''; } elseif ( is_string( $token ) ) { $token = preg_replace( '/^\s+/', '', $token, 1 ); if ( $token !== '' ) { break; } } else { break; } } // strip trailing space for ( $i = $n - 1; $i >= 0; $i-- ) { $token = &$tokens[$i]; if ( $token instanceof NlTk ) { $token = ''; // replace newline with empty } elseif ( is_string( $token ) ) { $token = preg_replace( '/\s+$/D', '', $token, 1 ); if ( $token !== '' ) { break; } } else { break; } } return $tokens; } /** * Checks whether the provided meta tag token is an annotation start token * @param Token $t * @return bool */ public static function isAnnotationStartToken( Token $t ): bool { $type = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP ); return $type !== null && !str_ends_with( $type, '/End' ); } /** * Checks whether the provided meta tag token is an annotation end token * @param Token $t * @return bool */ public static function isAnnotationEndToken( Token $t ): bool { $type = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP ); return $type !== null && 
str_ends_with( $type, '/End' ); } } PK ! Be�� � CompatJsonCodec.phpnu �Iw�� <?php declare( strict_types=1 ); /** * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * * @file */ namespace Wikimedia\Parsoid\Utils; use JsonSerializable; use Wikimedia\JsonCodec\JsonClassCodec; use Wikimedia\JsonCodec\JsonCodec; /** * This is a "compatible" JSON codec for the use of Parsoid test runners, etc. * In addition to supporting objects which implement `JsonCodecable`, it * tries to handle objects we might get from mediawiki-core which implement * JsonSerializable and other legacy serialization types. * * This should not be relied on for production! * * However, it is good enough to use in test cases, etc, and hopefully makes * them a little bit less fragile by not blowing up if it gets a martian * object from mediawiki-core stuck into the parser's extension data. 
*/ class CompatJsonCodec extends JsonCodec { /** @inheritDoc */ protected function codecFor( string $className ): ?JsonClassCodec { $codec = parent::codecFor( $className ); if ( $codec === null && is_a( $className, JsonSerializable::class, true ) ) { $codec = new class() implements JsonClassCodec { /** @inheritDoc */ public function toJsonArray( $obj ): array { return $obj->jsonSerialize(); } /** * @param class-string $className * @param array $json * @return never */ public function newFromJsonArray( string $className, array $json ) { // We can't use the core JsonUnserializable interface // (even blindly) because we can't make a non-null // JsonUnserializer which is required as the first argument // T346829, T327439#8634426 // That's ok, though, we can still *serialize* objects for // test cases even if we can't unserialize them. throw new \InvalidArgumentException( "Unserialization of this $className not possible" ); } /** @inheritDoc */ public function jsonClassHintFor( string $className, string $keyName ) { return null; } }; // Cache this for future use $this->addCodecFor( $className, $codec ); } return $codec; } } PK ! �z� � TitleException.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use RuntimeException; /** * Exception thrown for invalid titles * @note Replaces JS TitleError, because that implies it extends Error rather than Exception */ class TitleException extends RuntimeException { public $type; public $title; public function __construct( string $message, string $type, string $title ) { parent::__construct( $message ); $this->type = $type; $this->title = $title; } } PK ! �Z�-d d DOMCompat/TokenList.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils\DOMCompat; use Iterator; use LogicException; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\Utils\DOMCompat; /** * Implements the parts of DOMTokenList interface which are used by Parsoid. 
* @note To improve performance, no effort is made to keep the TokenList in sync * with the real class list if that is changed from elsewhere. * @see https://dom.spec.whatwg.org/#interface-domtokenlist */ class TokenList implements Iterator { /** @var Element The node whose classes are listed. */ protected $node; /** @var string|bool Copy of the attribute text, used for change detection. */ private $attribute = false; // Testing element existence with a list is less painful than returning numeric keys // with a map, so let's go with that. /** @var string[] */ private $classList; /** * @param Element $node The node whose classes are listed. */ public function __construct( $node ) { $this->node = $node; $this->lazyLoadClassList(); } /** * Return the number of CSS classes this element has. * @return int * @see https://dom.spec.whatwg.org/#dom-domtokenlist-length */ public function getLength(): int { $this->lazyLoadClassList(); return count( $this->classList ); } /** * Checks if the element has a given CSS class. * @param string $token * @return bool * @see https://dom.spec.whatwg.org/#dom-domtokenlist-contains */ public function contains( string $token ): bool { $this->lazyLoadClassList(); return in_array( $token, $this->classList, true ); } /** * Add CSS classes to the element. * @param string ...$tokens List of classes to add * @see https://dom.spec.whatwg.org/#dom-domtokenlist-add */ public function add( string ...$tokens ): void { $this->lazyLoadClassList(); $changed = false; foreach ( $tokens as $token ) { if ( !in_array( $token, $this->classList, true ) ) { $changed = true; $this->classList[] = $token; } } if ( $changed ) { $this->saveClassList(); } } /** * Remove CSS classes from the element. 
* @param string ...$tokens List of classes to remove * @see https://dom.spec.whatwg.org/#dom-domtokenlist-remove */ public function remove( string ...$tokens ): void { $this->lazyLoadClassList(); $changed = false; foreach ( $tokens as $token ) { $index = array_search( $token, $this->classList, true ); if ( $index !== false ) { array_splice( $this->classList, $index, 1 ); $changed = true; } } if ( $changed ) { $this->saveClassList(); } } public function current(): string { $this->lazyLoadClassList(); return current( $this->classList ); } public function next(): void { $this->lazyLoadClassList(); next( $this->classList ); } public function key(): ?int { $this->lazyLoadClassList(); return key( $this->classList ); } public function valid(): bool { $this->lazyLoadClassList(); return key( $this->classList ) !== null; } public function rewind(): void { $this->lazyLoadClassList(); reset( $this->classList ); } /** * Set the classList property based on the class attribute of the wrapped element. */ private function lazyLoadClassList(): void { $attrib = DOMCompat::getAttribute( $this->node, 'class' ) ?? ''; if ( $attrib !== $this->attribute ) { $this->attribute = $attrib; $this->classList = preg_split( '/\s+/', $attrib, -1, PREG_SPLIT_NO_EMPTY ); } } /** * Set the class attribute of the wrapped element based on the classList property. */ private function saveClassList(): void { if ( $this->classList === null ) { throw new LogicException( 'no class list to set' ); } elseif ( $this->classList === [] ) { $this->attribute = ''; $this->node->removeAttribute( 'class' ); } else { $this->attribute = implode( ' ', $this->classList ); $this->node->setAttribute( 'class', $this->attribute ); } } } PK ! 
�FC��7 �7 ContentUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Closure; use Wikimedia\Assert\Assert; use Wikimedia\Assert\UnreachableException; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; use Wikimedia\Parsoid\Wt2Html\XMLSerializer; /** * These utilities are for processing content that's generated * by parsing source input (ex: wikitext) */ class ContentUtils { /** * XML Serializer. * * @param Node $node * @param array $options XMLSerializer options. * @return string */ public static function toXML( Node $node, array $options = [] ): string { return XMLSerializer::serialize( $node, $options )['html']; } /** * dataobject aware XML serializer, to be used in the DOM post-processing phase. * * @param Node $node * @param array $options * @return string */ public static function ppToXML( Node $node, array $options = [] ): string { DOMDataUtils::visitAndStoreDataAttribs( $node, $options ); return self::toXML( $node, $options ); } /** * XXX: Don't use this outside of testing. It shouldn't be necessary * to create new documents when parsing or serializing. A document lives * on the environment which can be used to create fragments. The bag added * as a dynamic property to the PHP wrapper around the libxml doc * is at risk of being GC-ed. * * @param string $html * @param bool $validateXMLNames * @return Document */ public static function createDocument( string $html = '', bool $validateXMLNames = false ): Document { $doc = DOMUtils::parseHTML( $html, $validateXMLNames ); DOMDataUtils::prepareDoc( $doc ); return $doc; } /** * XXX: Don't use this outside of testing. It shouldn't be necessary * to create new documents when parsing or serializing. 
A document lives * on the environment which can be used to create fragments. The bag added * as a dynamic property to the PHP wrapper around the libxml doc * is at risk of being GC-ed. * * @param string $html * @param array $options * @return Document */ public static function createAndLoadDocument( string $html, array $options = [] ): Document { $doc = self::createDocument( $html, $options['validateXMLNames'] ?? false ); DOMDataUtils::visitAndLoadDataAttribs( DOMCompat::getBody( $doc ), $options ); return $doc; } /** * @param Document $doc * @param string $html * @param array $options * @return DocumentFragment */ public static function createAndLoadDocumentFragment( Document $doc, string $html, array $options = [] ): DocumentFragment { $domFragment = $doc->createDocumentFragment(); DOMUtils::setFragmentInnerHTML( $domFragment, $html ); DOMDataUtils::visitAndLoadDataAttribs( $domFragment, $options ); return $domFragment; } /** * Pull the data-parsoid script element out of the doc before serializing. * * @param Node $node * @param array $options XMLSerializer options. * @return array */ public static function extractDpAndSerialize( Node $node, array $options = [] ): array { $doc = DOMUtils::isBody( $node ) ? $node->ownerDocument : $node; $pb = DOMDataUtils::extractPageBundle( $doc ); $out = XMLSerializer::serialize( $node, $options ); $out['pb'] = $pb; return $out; } /** * Strip Parsoid-inserted section wrappers, annotation wrappers, and synthetic nodes * (fallback id spans with HTML4 ids for headings, auto-generated TOC metas * and possibly other such in the future) from the DOM. * * @param Element $node */ public static function stripUnnecessaryWrappersAndSyntheticNodes( Element $node ): void { $n = $node->firstChild; while ( $n ) { $next = $n->nextSibling; if ( $n instanceof Element ) { if ( DOMCompat::nodeName( $n ) === 'meta' && ( DOMDataUtils::getDataMw( $n )->autoGenerated ?? 
false ) ) { // Strip auto-generated synthetic meta tags $n->parentNode->removeChild( $n ); } elseif ( WTUtils::isFallbackIdSpan( $n ) ) { // Strip <span typeof='mw:FallbackId' ...></span> $n->parentNode->removeChild( $n ); } else { // Recurse into subtree before stripping this self::stripUnnecessaryWrappersAndSyntheticNodes( $n ); // Strip <section> tags and synthetic extended-annotation-region wrappers if ( WTUtils::isParsoidSectionTag( $n ) || DOMUtils::hasTypeOf( $n, 'mw:ExtendedAnnRange' ) ) { DOMUtils::migrateChildren( $n, $n->parentNode, $n ); $n->parentNode->removeChild( $n ); } } } $n = $next; } } /** * Extensions might be interested in examining their content embedded * in data-mw attributes that don't otherwise show up in the DOM. * * Ex: inline media captions that aren't rendered, language variant markup, * attributes that are transcluded. More scenarios might be added later. * * @param ParsoidExtensionAPI $extAPI * @param Element $elt The node whose data attributes need to be examined * @param Closure $proc The processor that will process the embedded HTML * Signature: (string) -> string * This processor will be provided the HTML string as input * and is expected to return a possibly modified string. */ public static function processAttributeEmbeddedHTML( ParsoidExtensionAPI $extAPI, Element $elt, Closure $proc ): void { if ( !$elt->hasAttribute( 'typeof' ) ) { return; } // Expanded attributes if ( DOMUtils::matchTypeOf( $elt, '/^mw:ExpandedAttrs$/' ) ) { $dmw = DOMDataUtils::getDataMw( $elt ); if ( $dmw->attribs ?? 
null ) { foreach ( $dmw->attribs as $a ) { // Look in both key and value of the DataMwAttrib foreach ( [ 'key', 'value' ] as $part ) { if ( !is_string( $a->$part ) && isset( $a->$part['html'] ) ) { $a->$part['html'] = $proc( $a->$part['html'] ); } } } } } // Language variant markup if ( DOMUtils::matchTypeOf( $elt, '/^mw:LanguageVariant$/' ) ) { $dmwv = DOMDataUtils::getJSONAttribute( $elt, 'data-mw-variant', null ); if ( $dmwv ) { if ( isset( $dmwv->disabled ) ) { $dmwv->disabled->t = $proc( $dmwv->disabled->t ); } if ( isset( $dmwv->twoway ) ) { foreach ( $dmwv->twoway as $l ) { $l->t = $proc( $l->t ); } } if ( isset( $dmwv->oneway ) ) { foreach ( $dmwv->oneway as $l ) { $l->f = $proc( $l->f ); $l->t = $proc( $l->t ); } } if ( isset( $dmwv->filter ) ) { $dmwv->filter->t = $proc( $dmwv->filter->t ); } DOMDataUtils::setJSONAttribute( $elt, 'data-mw-variant', $dmwv ); } } // Inline media -- look inside the data-mw attribute if ( WTUtils::isInlineMedia( $elt ) ) { $dmw = DOMDataUtils::getDataMw( $elt ); $caption = $dmw->caption ?? null; if ( $caption ) { $dmw->caption = $proc( $caption ); } } // Process extension-specific embedded HTML $extTagName = WTUtils::getExtTagName( $elt ); if ( $extTagName ) { $extConfig = $extAPI->getSiteConfig()->getExtTagConfig( $extTagName ); if ( $extConfig['options']['wt2html']['embedsHTMLInAttributes'] ?? false ) { $tagHandler = $extAPI->getSiteConfig()->getExtTagImpl( $extTagName ); $tagHandler->processAttributeEmbeddedHTML( $extAPI, $elt, $proc ); } } } /** * Shift the DOM Source Range (DSR) of a DOM fragment. * @param Env $env * @param Node $rootNode * @param callable $dsrFunc * @param ParsoidExtensionAPI $extAPI * @return Node Returns the $rootNode passed in to allow chaining. 
*/ public static function shiftDSR( Env $env, Node $rootNode, callable $dsrFunc, ParsoidExtensionAPI $extAPI ): Node { $doc = $rootNode->ownerDocument; $convertString = static function ( $str ) { // Stub $convertString out to allow definition of a pair of // mutually-recursive functions. return $str; }; $convertNode = static function ( Node $node ) use ( $env, $extAPI, $dsrFunc, &$convertString, &$convertNode ) { if ( !( $node instanceof Element ) ) { return; } $dp = DOMDataUtils::getDataParsoid( $node ); if ( isset( $dp->dsr ) ) { $dp->dsr = $dsrFunc( clone $dp->dsr ); // We don't need to setDataParsoid because dp is not a copy // This is a bit of a hack, but we use this function to // clear DSR properties as well. See below as well. if ( $dp->dsr === null ) { unset( $dp->dsr ); } } $tmp = $dp->getTemp(); if ( isset( $tmp->origDSR ) ) { // Even though tmp shouldn't escape Parsoid, go ahead and // convert to enable hybrid testing. $tmp->origDSR = $dsrFunc( clone $tmp->origDSR ); if ( $tmp->origDSR === null ) { unset( $tmp->origDSR ); } } if ( isset( $dp->extTagOffsets ) ) { $dp->extTagOffsets = $dsrFunc( clone $dp->extTagOffsets ); if ( $dp->extTagOffsets === null ) { unset( $dp->extTagOffsets ); } } // Handle embedded HTML in attributes self::processAttributeEmbeddedHTML( $extAPI, $node, $convertString ); // DOMFragments will have already been unpacked when DSR shifting is run if ( DOMUtils::hasTypeOf( $node, 'mw:DOMFragment' ) ) { throw new UnreachableException( "Shouldn't encounter these nodes here." ); } // However, extensions can choose to handle sealed fragments whenever // they want and so may be returned in subpipelines which could // subsequently be shifted if ( DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment/sealed/\w+$#D' ) ) { $dp = DOMDataUtils::getDataParsoid( $node ); if ( $dp->html ?? 
null ) { $domFragment = $env->getDOMFragment( $dp->html ); DOMPostOrder::traverse( $domFragment, $convertNode ); } } }; $convertString = function ( string $str ) use ( $doc, $env, $convertNode ): string { $node = self::createAndLoadDocumentFragment( $doc, $str ); DOMPostOrder::traverse( $node, $convertNode ); return self::ppToXML( $node, [ 'innerXML' => true ] ); }; DOMPostOrder::traverse( $rootNode, $convertNode ); return $rootNode; // chainable } /** * Convert DSR offsets in a Document between utf-8/ucs2/codepoint * indices. * * Offset types are: * - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`. * - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`. * - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`. * * @see TokenUtils::convertTokenOffsets for a related function on tokens. * * @param Env $env * @param Document $doc The document to convert * @param string $from Offset type to convert from. * @param string $to Offset type to convert to. */ public static function convertOffsets( Env $env, Document $doc, string $from, string $to ): void { $env->setCurrentOffsetType( $to ); if ( $from === $to ) { return; // Hey, that was easy! 
} $offsetMap = []; $offsets = []; $collect = static function ( int $n ) use ( &$offsetMap, &$offsets ) { if ( !array_key_exists( $n, $offsetMap ) ) { $box = (object)[ 'value' => $n ]; $offsetMap[$n] = $box; $offsets[] =& $box->value; } }; // Collect DSR offsets throughout the document $collectDSR = static function ( DomSourceRange $dsr ) use ( $collect ) { if ( $dsr->start !== null ) { $collect( $dsr->start ); $collect( $dsr->innerStart() ); } if ( $dsr->end !== null ) { $collect( $dsr->innerEnd() ); $collect( $dsr->end ); } return $dsr; }; $body = DOMCompat::getBody( $doc ); $extAPI = new ParsoidExtensionAPI( $env ); self::shiftDSR( $env, $body, $collectDSR, $extAPI ); if ( count( $offsets ) === 0 ) { return; /* nothing to do (shouldn't really happen) */ } // Now convert these offsets TokenUtils::convertOffsets( $env->topFrame->getSrcText(), $from, $to, $offsets ); // Apply converted offsets $applyDSR = static function ( DomSourceRange $dsr ) use ( $offsetMap ) { $start = $dsr->start; $openWidth = $dsr->openWidth; if ( $start !== null ) { $start = $offsetMap[$start]->value; $openWidth = $offsetMap[$dsr->innerStart()]->value - $start; } $end = $dsr->end; $closeWidth = $dsr->closeWidth; if ( $end !== null ) { $end = $offsetMap[$end]->value; $closeWidth = $end - $offsetMap[$dsr->innerEnd()]->value; } return new DomSourceRange( $start, $end, $openWidth, $closeWidth ); }; self::shiftDSR( $env, $body, $applyDSR, $extAPI ); } /** * @param Node $node * @param array $options * @return string */ private static function dumpNode( Node $node, array $options ): string { return self::toXML( $node, $options + [ 'saveData' => true ] ); } /** * Dump the DOM with attributes. 
* * @param Node $rootNode * @param string $title * @param array $options Associative array of options: * - dumpFragmentMap: Dump the fragment map from env * - quiet: Suppress separators * * storeDataAttribs options: * - discardDataParsoid * - keepTmp * - storeInPageBundle * - storeDiffMark * - env * - idIndex * * XMLSerializer options: * - smartQuote * - innerXML * - captureOffsets * - addDoctype * @return string The dump result */ public static function dumpDOM( Node $rootNode, string $title = '', array $options = [] ): string { if ( !empty( $options['dumpFragmentMap'] ) ) { Assert::invariant( isset( $options['env'] ), "env should be set" ); } $buf = ''; if ( empty( $options['quiet'] ) ) { $buf .= "----- {$title} -----\n"; } $buf .= self::dumpNode( $rootNode, $options ) . "\n"; // Dump cached fragments if ( !empty( $options['dumpFragmentMap'] ) ) { foreach ( $options['env']->getDOMFragmentMap() as $k => $fragment ) { $buf .= str_repeat( '=', 15 ) . "\n"; $buf .= "FRAGMENT {$k}\n"; $buf .= self::dumpNode( is_array( $fragment ) ? $fragment[0] : $fragment, $options ) . "\n"; } } if ( empty( $options['quiet'] ) ) { $buf .= str_repeat( '-', mb_strlen( $title ) + 12 ) . "\n"; } return $buf; } } PK ! o{�� � UrlUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; /** * Utilities for manipulating URLs * @see https://tools.ietf.org/html/rfc3986 */ class UrlUtils { /** * Parse a possibly-relative URL into components * * Note no percent-decoding is performed, and only minimal syntax validation. * * @param string $url * @return (string|null)[] * - 'scheme': Scheme of the url, if any. * - 'authority': Authority part of the url, if any. * This is the part in between the "//" and the path. For http, this is the "user@host:port". * - 'path': Path part of the URL. Never null, but may be the empty string. * - 'query': Query part of the URL, if any. * - 'fragment': Fragment part of the URL, if any. 
*/ public static function parseUrl( string $url ): array { $ret = [ 'scheme' => null, 'authority' => null, 'path' => '', 'query' => null, 'fragment' => null, ]; // Scheme? if ( preg_match( '!^([a-z][a-z0-9+.-]*):!i', $url, $m ) ) { $ret['scheme'] = $m[1]; $url = substr( $url, strlen( $m[0] ) ); } // Fragment? $i = strpos( $url, '#' ); if ( $i !== false ) { $ret['fragment'] = substr( $url, $i + 1 ); $url = substr( $url, 0, $i ); } // Query? $i = strpos( $url, '?' ); if ( $i !== false ) { $ret['query'] = substr( $url, $i + 1 ); $url = substr( $url, 0, $i ); } // Split authority and path if ( substr( $url, 0, 2 ) === '//' ) { $i = strpos( $url, '/', 2 ); if ( $i === false ) { $ret['authority'] = substr( $url, 2 ); $ret['path'] = ''; } else { $ret['authority'] = substr( $url, 2, $i - 2 ); $ret['path'] = substr( $url, $i ); } } else { $ret['path'] = $url; } return $ret; } /** * This function will reassemble a URL parsed with self::parseURL(). * * Note no percent-encoding or syntax validation is performed. * * @param array $urlParts URL parts, as output from self::parseUrl * @return string URL assembled from its component parts */ public static function assembleUrl( array $urlParts ): string { $ret = ''; if ( isset( $urlParts['scheme'] ) ) { $ret .= $urlParts['scheme'] . ':'; } if ( isset( $urlParts['authority'] ) ) { $ret .= '//' . $urlParts['authority']; } if ( isset( $urlParts['path'] ) ) { $ret .= $urlParts['path']; } if ( isset( $urlParts['query'] ) ) { $ret .= '?' . $urlParts['query']; } if ( isset( $urlParts['fragment'] ) ) { $ret .= '#' . $urlParts['fragment']; } return $ret; } /** * Remove all dot-segments in the provided URL path. For example, * '/a/./b/../c/' becomes '/a/c/'. 
* * @see https://tools.ietf.org/html/rfc3986#section-5.2.4 * @note Copied from MediaWiki's UrlUtils::removeDotSegments() * @param string $urlPath URL path, potentially containing dot-segments * @return string URL path with all dot-segments removed */ public static function removeDotSegments( string $urlPath ): string { $output = ''; $inputOffset = 0; $inputLength = strlen( $urlPath ); while ( $inputOffset < $inputLength ) { $prefixLengthOne = substr( $urlPath, $inputOffset, 1 ); $prefixLengthTwo = substr( $urlPath, $inputOffset, 2 ); $prefixLengthThree = substr( $urlPath, $inputOffset, 3 ); $prefixLengthFour = substr( $urlPath, $inputOffset, 4 ); $trimOutput = false; if ( $prefixLengthTwo == './' ) { # Step A, remove leading "./" $inputOffset += 2; } elseif ( $prefixLengthThree == '../' ) { # Step A, remove leading "../" $inputOffset += 3; } elseif ( ( $prefixLengthTwo == '/.' ) && ( $inputOffset + 2 == $inputLength ) ) { # Step B, replace leading "/.$" with "/" $inputOffset += 1; $urlPath[$inputOffset] = '/'; } elseif ( $prefixLengthThree == '/./' ) { # Step B, replace leading "/./" with "/" $inputOffset += 2; } elseif ( $prefixLengthThree == '/..' && ( $inputOffset + 3 == $inputLength ) ) { # Step C, replace leading "/..$" with "/" and # remove last path component in output $inputOffset += 2; $urlPath[$inputOffset] = '/'; $trimOutput = true; } elseif ( $prefixLengthFour == '/../' ) { # Step C, replace leading "/../" with "/" and # remove last path component in output $inputOffset += 3; $trimOutput = true; } elseif ( ( $prefixLengthOne == '.' ) && ( $inputOffset + 1 == $inputLength ) ) { # Step D, remove "^.$" $inputOffset += 1; } elseif ( ( $prefixLengthTwo == '..' 
) && ( $inputOffset + 2 == $inputLength ) ) { # Step D, remove "^..$" $inputOffset += 2; } else { # Step E, move leading path segment to output if ( $prefixLengthOne == '/' ) { $slashPos = strpos( $urlPath, '/', $inputOffset + 1 ); } else { $slashPos = strpos( $urlPath, '/', $inputOffset ); } if ( $slashPos === false ) { $output .= substr( $urlPath, $inputOffset ); $inputOffset = $inputLength; } else { $output .= substr( $urlPath, $inputOffset, $slashPos - $inputOffset ); $inputOffset += $slashPos - $inputOffset; } } if ( $trimOutput ) { $slashPos = strrpos( $output, '/' ); if ( $slashPos === false ) { $output = ''; } else { $output = substr( $output, 0, $slashPos ); } } } return $output; } /** * Expand a relative URL using a base URL * * @see https://tools.ietf.org/html/rfc3986#section-5.2.2 * @param string $url Relative URL to expand * @param string $base Base URL to expand relative to * @return string Expanded URL */ public static function expandUrl( string $url, string $base ): string { $b = self::parseUrl( $base ); $r = self::parseUrl( $url ); $t = []; if ( isset( $r['scheme'] ) ) { $t['scheme'] = $r['scheme']; $t['authority'] = $r['authority'] ?? null; $t['path'] = self::removeDotSegments( $r['path'] ); $t['query'] = $r['query'] ?? null; } else { if ( isset( $r['authority'] ) ) { $t['authority'] = $r['authority']; $t['path'] = self::removeDotSegments( $r['path'] ); $t['query'] = $r['query'] ?? null; } else { if ( $r['path'] === '' ) { $t['path'] = $b['path']; $t['query'] = $r['query'] ?? $b['query'] ?? null; } else { if ( $r['path'][0] === '/' ) { $t['path'] = self::removeDotSegments( $r['path'] ); } else { // start merge(), see RFC 3986 §5.2.3 if ( isset( $b['authority'] ) && $b['path'] === '' ) { $t['path'] = '/' . $r['path']; } else { $i = strrpos( $b['path'], '/' ); if ( $i === false ) { $t['path'] = $r['path']; } else { $t['path'] = substr( $b['path'], 0, $i + 1 ) . 
$r['path']; } } // end merge() $t['path'] = self::removeDotSegments( $t['path'] ); } $t['query'] = $r['query'] ?? null; } $t['authority'] = $b['authority'] ?? null; } $t['scheme'] = $b['scheme'] ?? null; } $t['fragment'] = $r['fragment'] ?? null; return self::assembleUrl( $t ); } /** * Check whether a given URL has a domain that occurs in a given set of domains * * @param string $url * @param array $domains Array of domains (strings) * @return bool True if the host part of $url ends in one of the strings in $domains */ public static function matchesDomainList( string $url, array $domains ): bool { $bits = self::parseUrl( $url ); if ( isset( $bits['authority'] ) ) { $host = '.' . $bits['authority']; foreach ( $domains as $domain ) { $domain = '.' . $domain; if ( substr( $host, -strlen( $domain ) ) === $domain ) { return true; } } } return false; } } PK ! T��^ ^ DOMDataUtils.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Composer\Semver\Semver; use stdClass; use Wikimedia\Assert\Assert; use Wikimedia\Assert\UnreachableException; use Wikimedia\JsonCodec\Hint; use Wikimedia\JsonCodec\JsonCodec; use Wikimedia\Parsoid\Core\PageBundle; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\NodeData\DataBag; use Wikimedia\Parsoid\NodeData\DataMw; use Wikimedia\Parsoid\NodeData\DataMwI18n; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\NodeData\I18nInfo; use Wikimedia\Parsoid\NodeData\NodeData; use Wikimedia\Parsoid\NodeData\TempData; /** * These helpers pertain to HTML and data attributes of a node. */ class DOMDataUtils { public const DATA_OBJECT_ATTR_NAME = 'data-object-id'; /** * Return the dynamic "bag" property of a Document. * @param Document $doc * @return DataBag */ public static function getBag( Document $doc ): DataBag { // This is a dynamic property; it is not declared. 
// All references go through here so we can suppress phan's complaint. // @phan-suppress-next-line PhanUndeclaredProperty return $doc->bag; } /** * Return the JsonCodec used for rich attributes in a Document. * @param Document $doc * @return JsonCodec */ public static function getCodec( Document $doc ): JsonCodec { // This is a dynamic property; it is not declared. // All references go through here so we can suppress phan's complaint. // @phan-suppress-next-line PhanUndeclaredProperty return $doc->codec; } public static function prepareDoc( Document $doc ): void { // `bag` is a deliberate dynamic property; see DOMDataUtils::getBag() // @phan-suppress-next-line PhanUndeclaredProperty dynamic property $doc->bag = new DataBag(); // `codec` is a deliberate dynamic property; see DOMDataUtils::getCodec() // @phan-suppress-next-line PhanUndeclaredProperty dynamic property $doc->codec = new JsonCodec(); // Cache the head and body. DOMCompat::getHead( $doc ); DOMCompat::getBody( $doc ); } /** * @param Document $topLevelDoc * @param Document $childDoc */ public static function prepareChildDoc( Document $topLevelDoc, Document $childDoc ) { // @phan-suppress-next-line PhanUndeclaredProperty dynamic property Assert::invariant( $topLevelDoc->bag instanceof DataBag, 'doc bag not set' ); // @phan-suppress-next-line PhanUndeclaredProperty dynamic property $childDoc->bag = $topLevelDoc->bag; // @phan-suppress-next-line PhanUndeclaredProperty dynamic property $childDoc->codec = $topLevelDoc->codec; } /** * Stash $obj in $doc and return an id for later retrieval * @param Document $doc * @param NodeData $obj * @return int */ public static function stashObjectInDoc( Document $doc, NodeData $obj ): int { return self::getBag( $doc )->stashObject( $obj ); } /** * Does this node have any attributes? 
* @param Element $node * @return bool */ public static function noAttrs( Element $node ): bool { // The 'xmlns' attribute is "invisible" T235295 if ( $node->hasAttribute( 'xmlns' ) ) { return false; } $numAttrs = count( $node->attributes ); return $numAttrs === 0 || ( $numAttrs === 1 && $node->hasAttribute( self::DATA_OBJECT_ATTR_NAME ) ); } /** * Get data object from a node. * * @param Element $node node * @return NodeData */ public static function getNodeData( Element $node ): NodeData { if ( !$node->hasAttribute( self::DATA_OBJECT_ATTR_NAME ) ) { // Initialized on first request $dataObject = new NodeData; self::setNodeData( $node, $dataObject ); return $dataObject; } $nodeId = DOMCompat::getAttribute( $node, self::DATA_OBJECT_ATTR_NAME ); if ( $nodeId !== null ) { $dataObject = self::getBag( $node->ownerDocument )->getObject( (int)$nodeId ); } else { $dataObject = null; // Make phan happy } Assert::invariant( isset( $dataObject ), 'Bogus nodeId given!' ); if ( isset( $dataObject->storedId ) ) { throw new UnreachableException( 'Trying to fetch node data without loading!' . // If this node's data-object id is different from storedId, // it will indicate that the data-parsoid object was shared // between nodes without getting cloned. Useful for debugging. 'Node id: ' . $nodeId . 'Stored data: ' . PHPUtils::jsonEncode( $dataObject ) ); } return $dataObject; } /** * Set node data. * * @param Element $node node * @param NodeData $data data */ public static function setNodeData( Element $node, NodeData $data ): void { $nodeId = self::stashObjectInDoc( $node->ownerDocument, $data ); $node->setAttribute( self::DATA_OBJECT_ATTR_NAME, (string)$nodeId ); } /** * Get data parsoid info from a node. * * @param Element $node node * @return DataParsoid */ public static function getDataParsoid( Element $node ): DataParsoid { $data = self::getNodeData( $node ); $data->parsoid ??= new DataParsoid; return $data->parsoid; } /** * Set data parsoid info on a node. 
* * @param Element $node node * @param DataParsoid $dp data-parsoid */ public static function setDataParsoid( Element $node, DataParsoid $dp ): void { $data = self::getNodeData( $node ); $data->parsoid = $dp; } /** * Returns the i18n information of a node. This is in private access because it shouldn't * typically be used directly; instead getDataNodeI18n and getDataAttrI18n should be used. * @param Element $node * @return DataMwI18n|null */ private static function getDataMwI18n( Element $node ): ?DataMwI18n { $data = self::getNodeData( $node ); // We won't set a default value for this property return $data->i18n ?? null; } /** * Sets the i18n information of a node. This is in private access because it shouldn't * typically be used directly; instead setDataNodeI18n and setDataAttrI18n should be used. */ private static function setDataMwI18n( Element $node, DataMwI18n $i18n ) { $data = self::getNodeData( $node ); $data->i18n = $i18n; } /** * Retrieves internationalization (i18n) information of a node (typically for localization) * @param Element $node * @return ?I18nInfo */ public static function getDataNodeI18n( Element $node ): ?I18nInfo { $data = self::getNodeData( $node ); // We won't set a default value for this property if ( !isset( $data->i18n ) ) { return null; } return $data->i18n->getSpanInfo(); } /** * Sets internationalization (i18n) information of a node, used for later localization */ public static function setDataNodeI18n( Element $node, I18nInfo $i18n ) { $data = self::getNodeData( $node ); $data->i18n ??= new DataMwI18n(); $data->i18n->setSpanInfo( $i18n ); } /** * Retrieves internationalization (i18n) information of an attribute value (typically for * localization) * @param Element $node * @param string $name * @return ?I18nInfo */ public static function getDataAttrI18n( Element $node, string $name ): ?I18nInfo { $data = self::getNodeData( $node ); // We won't set a default value for this property if ( !isset( $data->i18n ) ) { return null; } 
return $data->i18n->getAttributeInfo( $name ); } /** * Sets internationalization (i18n) information of a attribute value, used for later * localization * @param Element $node * @param string $name * @param I18nInfo $i18n */ public static function setDataAttrI18n( Element $node, string $name, I18nInfo $i18n ) { $data = self::getNodeData( $node ); $data->i18n ??= new DataMwI18n(); $data->i18n->setAttributeInfo( $name, $i18n ); } /** * @param Element $node * @return array */ public static function getDataAttrI18nNames( Element $node ): array { $data = self::getNodeData( $node ); // We won't set a default value for this property if ( !isset( $data->i18n ) ) { return []; } return $data->i18n->getAttributeNames(); } /** * Get data diff info from a node. * * @param Element $node node * @return ?stdClass */ public static function getDataParsoidDiff( Element $node ): ?stdClass { $data = self::getNodeData( $node ); // We won't set a default value for this property return $data->parsoid_diff ?? null; } /** * Set data diff info on a node. * * @param Element $node node * @param ?stdClass $diffObj data-parsoid-diff object */ public static function setDataParsoidDiff( Element $node, ?stdClass $diffObj ): void { $data = self::getNodeData( $node ); $data->parsoid_diff = $diffObj; } /** * Get data meta wiki info from a node. * * @param Element $node node * @return DataMw */ public static function getDataMw( Element $node ): DataMw { $data = self::getNodeData( $node ); $data->mw ??= new DataMw; return $data->mw; } /** * Set data meta wiki info from a node. * * @param Element $node node * @param ?DataMw $dmw data-mw */ public static function setDataMw( Element $node, ?DataMw $dmw ): void { $data = self::getNodeData( $node ); $data->mw = $dmw; } /** * Check if there is meta wiki info in a node. 
* * @param Element $node node * @return bool */ public static function validDataMw( Element $node ): bool { return (array)self::getDataMw( $node ) !== []; } /** * Check if there is i18n info on a node (for the node or its attributes) * @param Element $node * @return bool */ public static function validDataMwI18n( Element $node ): bool { return self::getDataMwI18n( $node ) !== null; } /** * Get an object from a JSON-encoded XML attribute on a node. * * @param Element $node node * @param string $name name * @param mixed $defaultVal * @return mixed */ public static function getJSONAttribute( Element $node, string $name, $defaultVal ) { $attVal = DOMCompat::getAttribute( $node, $name ); if ( $attVal === null ) { return $defaultVal; } $decoded = PHPUtils::jsonDecode( $attVal, false ); if ( $decoded !== null ) { return $decoded; } else { error_log( 'ERROR: Could not decode attribute-val ' . $attVal . ' for ' . $name . ' on node ' . DOMCompat::nodeName( $node ) ); return $defaultVal; } } /** * Set a attribute on a node with a JSON-encoded object. * * @param Element $node node * @param string $name Name of the attribute. * @param mixed $obj value of the attribute to */ public static function setJSONAttribute( Element $node, string $name, $obj ): void { $val = $obj === [] ? '{}' : PHPUtils::jsonEncode( $obj ); $node->setAttribute( $name, $val ); } /** * Set shadow info on a node; similar to the method on tokens. * Records a key = value pair in data-parsoid['a'] property. * * This is effectively a call of 'setShadowInfoIfModified' except * there is no original value, so by definition, $val is modified. * * @param Element $node node * @param string $name Name of the attribute. * @param mixed $val val */ public static function setShadowInfo( Element $node, string $name, $val ): void { $dp = self::getDataParsoid( $node ); $dp->a ??= []; $dp->sa ??= []; $dp->a[$name] = $val; } /** * Set shadow info on a node; similar to the method on tokens. 
* * If the new value ($val) for the key ($name) is different from the * original value ($origVal): * - the new value is recorded in data-parsoid->a and * - the original value is recorded in data-parsoid->sa * * @param Element $node node * @param string $name Name of the attribute. * @param mixed $val val * @param mixed $origVal original value (null is a valid value) * @param bool $skipOrig */ public static function setShadowInfoIfModified( Element $node, string $name, $val, $origVal, bool $skipOrig = false ): void { if ( !$skipOrig && ( $val === $origVal || $origVal === null ) ) { return; } $dp = self::getDataParsoid( $node ); $dp->a ??= []; $dp->sa ??= []; // FIXME: This is a hack to not overwrite already shadowed info. // We should either fix the call site that depends on this // behaviour to do an explicit check, or double down on this // by porting it to the token method as well. if ( !$skipOrig && !array_key_exists( $name, $dp->a ) ) { $dp->sa[$name] = $origVal; } $dp->a[$name] = $val; } /** * Set an attribute and shadow info to a node. * Similar to the method on tokens * * @param Element $node node * @param string $name Name of the attribute. * @param mixed $val value * @param mixed $origVal original value * @param bool $skipOrig */ public static function addNormalizedAttribute( Element $node, string $name, $val, $origVal, bool $skipOrig = false ): void { if ( $name === 'id' ) { DOMCompat::setIdAttribute( $node, $val ); } else { $node->setAttribute( $name, $val ); } self::setShadowInfoIfModified( $node, $name, $val, $origVal, $skipOrig ); } /** * Get this document's pagebundle object * @param Document $doc * @return PageBundle */ public static function getPageBundle( Document $doc ): PageBundle { return self::getBag( $doc )->getPageBundle(); } /** * Removes the `data-*` attribute from a node, and migrates the data to the * document's JSON store. 
Generates a unique id with the following format: * ``` * mw<base64-encoded counter> * ``` * but attempts to keep user defined ids. * * TODO: Note that $data is effective a partial PageBundle containing * only the 'parsoid' and 'mw' properties. * * @param Element $node node * @param stdClass $data data * @param array $idIndex Index of used id attributes in the DOM */ public static function storeInPageBundle( Element $node, stdClass $data, array $idIndex ): void { $hints = self::getCodecHints(); $uid = DOMCompat::getAttribute( $node, 'id' ); $document = $node->ownerDocument; $pb = self::getPageBundle( $document ); $codec = self::getCodec( $document ); $docDp = &$pb->parsoid; $origId = $uid; if ( $uid !== null && array_key_exists( $uid, $docDp['ids'] ) ) { $uid = null; } if ( $uid === '' ) { $uid = null; } if ( $uid === null ) { do { $docDp['counter'] += 1; // PORT-FIXME: NOTE that we aren't updating the idIndex here because // we are generating unique ids that will not conflict. In any case, // the idIndex is a workaround for the PHP DOM's issues and we might // switch out of this in the future anyway. $uid = 'mw' . PHPUtils::counterToBase64( $docDp['counter'] ); } while ( isset( $idIndex[$uid] ) ); self::addNormalizedAttribute( $node, 'id', $uid, $origId ); } // Convert from DataParsoid/DataMw objects to associative array $docDp['ids'][$uid] = $codec->toJsonArray( $data->parsoid, $hints['data-parsoid'] ); if ( isset( $data->mw ) ) { $pb->mw['ids'][$uid] = $codec->toJsonArray( $data->mw, $hints['data-mw'] ); } } /** * Helper function to create static Hint objects for JsonCodec. 
* @return array<Hint> */ public static function getCodecHints(): array { static $hints = null; if ( $hints === null ) { $hints = [ 'data-parsoid' => Hint::build( DataParsoid::class, Hint::ALLOW_OBJECT ), 'data-mw' => Hint::build( DataMw::class, Hint::ALLOW_OBJECT ), ]; } return $hints; } /** * @param Document $doc doc * @param PageBundle $pb object */ public static function injectPageBundle( Document $doc, PageBundle $pb ): void { $script = DOMUtils::appendToHead( $doc, 'script', [ 'id' => 'mw-pagebundle', 'type' => 'application/x-mw-pagebundle', ] ); $script->appendChild( $doc->createTextNode( $pb->encodeForHeadElement() ) ); } /** * @param Document $doc doc * @return stdClass|null */ public static function extractPageBundle( Document $doc ): ?stdClass { $pb = null; $dpScriptElt = DOMCompat::getElementById( $doc, 'mw-pagebundle' ); if ( $dpScriptElt ) { $dpScriptElt->parentNode->removeChild( $dpScriptElt ); // we actually want arrays in the page bundle rather than stdClasses; but we still // want to access the object properties $pb = (object)PHPUtils::jsonDecode( $dpScriptElt->textContent ); } return $pb; } /** * Walk DOM from node downward calling loadDataAttribs * * @param Node $node node * @param array $options options */ public static function visitAndLoadDataAttribs( Node $node, array $options = [] ): void { DOMUtils::visitDOM( $node, [ self::class, 'loadDataAttribs' ], $options ); } /** * These are intended be used on a document after post-processing, so that * the underlying .dataobject is transparently applied (in the store case) * and reloaded (in the load case), rather than worrying about keeping * the attributes up-to-date throughout that phase. For the most part, * using this.ppTo* should be sufficient and using these directly should be * avoided. 
* * @param Node $node node * @param array $options options */ public static function loadDataAttribs( Node $node, array $options ): void { if ( !( $node instanceof Element ) ) { return; } // Reset the node data object's stored state, since we're reloading it self::setNodeData( $node, new NodeData ); $codec = self::getCodec( $node->ownerDocument ); $dataParsoidAttr = DOMCompat::getAttribute( $node, 'data-parsoid' ); $dp = $codec->newFromJsonString( $dataParsoidAttr ?? '{}', self::getCodecHints()['data-parsoid'] ); if ( !empty( $options['markNew'] ) ) { $dp->setTempFlag( TempData::IS_NEW, $dataParsoidAttr === null ); } self::setDataParsoid( $node, $dp ); $node->removeAttribute( 'data-parsoid' ); $dataMwAttr = DOMCompat::getAttribute( $node, 'data-mw' ); $dmw = $dataMwAttr === null ? null : $codec->newFromJsonString( $dataMwAttr, self::getCodecHints()['data-mw'] ); self::setDataMw( $node, $dmw ); $node->removeAttribute( 'data-mw' ); $dpd = self::getJSONAttribute( $node, 'data-parsoid-diff', null ); self::setDataParsoidDiff( $node, $dpd ); $node->removeAttribute( 'data-parsoid-diff' ); $dataI18n = DOMCompat::getAttribute( $node, 'data-mw-i18n' ); if ( $dataI18n !== null ) { $i18n = DataMwI18n::fromJson( PHPUtils::jsonDecode( $dataI18n, true ) ); self::setDataMwI18n( $node, $i18n ); $node->removeAttribute( 'data-mw-i18n' ); } } /** * Builds an index of id attributes seen in the DOM * @param Node $node * @return array */ public static function usedIdIndex( Node $node ): array { $index = []; DOMUtils::visitDOM( DOMCompat::getBody( $node->ownerDocument ), static function ( Node $n, ?array $options = null ) use ( &$index ) { if ( $n instanceof Element ) { $id = DOMCompat::getAttribute( $n, 'id' ); if ( $id !== null ) { $index[$id] = true; } } }, [] ); return $index; } /** * Walk DOM from node downward calling storeDataAttribs * * @param Node $node node * @param array $options options */ public static function visitAndStoreDataAttribs( Node $node, array $options = [] ): void 
{ // PORT-FIXME: storeDataAttribs calls storeInPageBundle which calls getElementById. // PHP's `getElementById` implementation is broken, and we work around that by // using Zest which uses XPath. So, getElementById call can be O(n) and calling it // on on every element of the DOM via vistDOM here makes it O(n^2) instead of O(n). // So, we work around that by building an index and avoiding getElementById entirely // in storeInPageBundle. if ( !empty( $options['storeInPageBundle'] ) ) { $options['idIndex'] = self::usedIdIndex( $node ); } DOMUtils::visitDOM( $node, [ self::class, 'storeDataAttribs' ], $options ); } /** * Copy data attributes from the bag to either JSON-encoded attributes on * each node, or the page bundle, erasing the data-object-id attributes. * * @param Node $node node * @param ?array $options options * - discardDataParsoid: Discard DataParsoid objects instead of storing them * - keepTmp: Preserve DataParsoid::$tmp * - storeInPageBundle: If true, data will be stored in the page bundle * instead of data-parsoid and data-mw. * - env: The Env object required for various features * - idIndex: Array of used ID attributes */ public static function storeDataAttribs( Node $node, ?array $options = null ): void { $hints = self::getCodecHints(); $options ??= []; if ( !( $node instanceof Element ) ) { return; } Assert::invariant( empty( $options['discardDataParsoid'] ) || empty( $options['keepTmp'] ), 'Conflicting options: discardDataParsoid and keepTmp are both enabled.' ); $codec = self::getCodec( $node->ownerDocument ); $dp = self::getDataParsoid( $node ); $discardDataParsoid = !empty( $options['discardDataParsoid'] ); if ( $dp->getTempFlag( TempData::IS_NEW ) ) { // Only necessary to support the cite extension's getById, // that's already been loaded once. // // This is basically a hack to ensure that DOMUtils.isNewElt // continues to work since we effectively rely on the absence // of data-parsoid to identify new elements. 
But, loadDataAttribs // creates an empty {} if one doesn't exist. So, this hack // ensures that a loadDataAttribs + storeDataAttribs pair don't // dirty the node by introducing an empty data-parsoid attribute // where one didn't exist before. // // Ideally, we'll find a better solution for this edge case later. $discardDataParsoid = true; } $data = null; if ( !$discardDataParsoid ) { if ( empty( $options['keepTmp'] ) ) { // @phan-suppress-next-line PhanTypeObjectUnsetDeclaredProperty unset( $dp->tmp ); } if ( !empty( $options['storeInPageBundle'] ) ) { $data ??= new stdClass; $data->parsoid = $dp; } else { $node->setAttribute( 'data-parsoid', PHPUtils::jsonEncode( $codec->toJsonArray( $dp, $hints['data-parsoid'] ) ) ); } } // Strip invalid data-mw attributes if ( self::validDataMw( $node ) ) { if ( !empty( $options['storeInPageBundle'] ) && isset( $options['env'] ) && // The pagebundle didn't have data-mw before 999.x Semver::satisfies( $options['env']->getOutputContentVersion(), '^999.0.0' ) ) { $data ??= new stdClass; $data->mw = self::getDataMw( $node ); } else { $node->setAttribute( 'data-mw', PHPUtils::jsonEncode( $codec->toJsonArray( self::getDataMw( $node ), $hints['data-mw'] ) ) ); } } if ( self::validDataMwI18n( $node ) ) { self::setJSONAttribute( $node, 'data-mw-i18n', self::getDataMwI18n( $node ) ); } // Store pagebundle if ( $data !== null ) { self::storeInPageBundle( $node, $data, $options['idIndex'] ); } // Indicate that this node's data has been stored so that if we try // to access it after the fact we're aware and remove the attribute // since it's no longer needed. $nd = self::getNodeData( $node ); $id = DOMCompat::getAttribute( $node, self::DATA_OBJECT_ATTR_NAME ); $nd->storedId = $id !== null ? 
intval( $id ) : null; $node->removeAttribute( self::DATA_OBJECT_ATTR_NAME ); } /** * Clones a node and its data bag * @param Element $elt * @param bool $deep * @return Element */ public static function cloneNode( Element $elt, bool $deep ): Element { $clone = $elt->cloneNode( $deep ); '@phan-var Element $clone'; // @var Element $clone // We do not need to worry about $deep because a shallow clone does not have child nodes, // so it's always cloning data on the cloned tree (which may be empty). self::fixClonedData( $clone ); return $clone; } /** * Recursively fixes cloned data from $elt: to avoid conflicts of element IDs, we clone the * data and set it in the node with a new element ID (which setNodeData does). * @param Element $elt */ private static function fixClonedData( Element $elt ) { if ( $elt->hasAttribute( self::DATA_OBJECT_ATTR_NAME ) ) { self::setNodeData( $elt, self::getNodeData( $elt )->cloneNodeData() ); } foreach ( $elt->childNodes as $child ) { if ( $child instanceof Element ) { self::fixClonedData( $child ); } } } } PK ! ͧ� � DOMPostOrder.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Utils; use Wikimedia\Parsoid\DOM\Node; /** * Non-recursive post-order traversal of a DOM tree. */ class DOMPostOrder { /** * @suppress PhanEmptyPrivateMethod */ private function __construct() { /* Not meant to be instantiated. */ } /** * Non-recursive post-order traversal of a DOM tree. * @param Node $root * @param callable $visitFunc Called in post-order on each node. */ public static function traverse( Node $root, callable $visitFunc ): void { $node = $root; while ( true ) { // Find leftmost (grand)child, and visit that first. while ( $node->firstChild ) { $node = $node->firstChild; } while ( true ) { $visitFunc( $node ); if ( $node === $root ) { return; // Visiting the root is the last thing we do. } /* Look for right sibling to continue traversal. 
*/ if ( $node->nextSibling ) { $node = $node->nextSibling; /* Loop back and visit its leftmost (grand)child first. */ break; } /* Visit parent only after we've run out of right siblings. */ $node = $node->parentNode; } } } } PK ! DyG�B2 B2 PHPUtils.phpnu �Iw�� PK ! {z0�q� q� ~2 WTUtils.phpnu �Iw�� PK ! �S �S *� Utils.phpnu �Iw�� PK ! <(#R/ / � ComputeSelectiveStats.phpnu �Iw�� PK ! �Q�@ v6 TitleValue.phpnu �Iw�� PK ! Y��J�0 �0 �? Title.phpnu �Iw�� PK ! N��H� � �p DTState.phpnu �Iw�� PK ! ܍�.�k �k Gt DOMUtils.phpnu �Iw�� PK ! g���� � � DiffDOMUtils.phpnu �Iw�� PK ! %O�Aq q �� ConfigUtils.phpnu �Iw�� PK ! �S�Z8 8 �� Timing.phpnu �Iw�� PK ! �G�ܳ0 �0 ScriptUtils.phpnu �Iw�� PK ! �xQ.a .a �5 PipelineUtils.phpnu �Iw�� PK ! �&�o! o! d� DOMTraverser.phpnu �Iw�� PK ! sٟ�D �D � DOMCompat.phpnu �Iw�� PK ! �gG��X �X � TokenUtils.phpnu �Iw�� PK ! Be�� � �V CompatJsonCodec.phpnu �Iw�� PK ! �z� � b TitleException.phpnu �Iw�� PK ! �Z�-d d (d DOMCompat/TokenList.phpnu �Iw�� PK ! �FC��7 �7 �s ContentUtils.phpnu �Iw�� PK ! o{�� � �� UrlUtils.phpnu �Iw�� PK ! T��^ ^ �� DOMDataUtils.phpnu �Iw�� PK ! ͧ� � ( DOMPostOrder.phpnu �Iw�� PK �,
| ver. 1.1 | |
.
| PHP 8.4.18 | Ð“ÐµÐ½ÐµÑ€Ð°Ñ†Ð¸Ñ Ñтраницы: 0.02 |
proxy
|
phpinfo
|
ÐаÑтройка