Файловый менеджер - Редактировать - /var/www/html/Processors.zip
Назад
PK ! ��4LO� O� DOMRangeBuilder.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Error; use SplObjectStorage; use Wikimedia\Assert\Assert; use Wikimedia\Assert\UnreachableException; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\Core\ElementRange; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\NodeData\TempData; use Wikimedia\Parsoid\NodeData\TemplateInfo; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Utils\PHPUtils; use Wikimedia\Parsoid\Utils\Utils; use Wikimedia\Parsoid\Utils\WTUtils; use Wikimedia\Parsoid\Wt2Html\Frame; /** * Template encapsulation happens in three steps. * * 1. findWrappableTemplateRanges * * Locate start and end metas. Walk upwards towards the root from both and * find a common ancestor A. The subtree rooted at A is now effectively the * scope of the dom template ouput. * * 2. findTopLevelNonOverlappingRanges * * Mark all nodes in a range and walk up to root from each range start to * determine overlaps, nesting. Merge overlapping and nested ranges to find * the subset of top-level non-overlapping ranges which will be wrapped as * individual units. * * 3. encapsulateTemplates * * For each non-overlapping range, * - compute a data-mw according to the DOM spec * - replace the start / end meta markers with transclusion type and data-mw * on the first DOM node * - add about ids on all top-level nodes of the range * * This is a simple high-level overview of the 3 steps to help understand this * code. * * FIXME: At some point, more of the details should be extracted and documented * in pseudo-code as an algorithm. 
* @module */
class DOMRangeBuilder {

	private const MAP_TBODY_TR = [
		'tbody' => true,
		'tr' => true
	];

	/** @var Document */
	private $document;

	/** @var Frame */
	private $frame;

	/** @var Env */
	protected $env;

	/** @var SplObjectStorage */
	protected $nodeRanges;

	/** @var array<string|CompoundTemplateInfo>[] */
	private $compoundTpls = [];

	/** @var string */
	protected $traceType;

	/**
	 * Set up a builder for one document.
	 *
	 * The environment is taken from the frame, the node -> ranges storage
	 * starts out empty, and tracing is logged under the "tplwrap" type.
	 *
	 * @param Document $document Document used to create wrapper elements
	 * @param Frame $frame Frame providing the Env and the source text
	 */
	public function __construct( Document $document, Frame $frame ) {
		$this->document = $document;
		$this->frame = $frame;
		$this->env = $frame->getEnv();
		$this->nodeRanges = new SplObjectStorage;
		$this->traceType = "tplwrap";
	}

	/**
	 * Transfer DSR information from $source to $target, the node that now
	 * acts as the first node of a range.
	 *
	 * - If both nodes have an end offset and the target's is larger, only the
	 *   target's start offset is replaced (by the source's start, or null).
	 * - Otherwise, if the source has a DSR, the target gets a clone of it and
	 *   the source's `src` (or null).
	 * - If the source has no DSR at all, the target is left untouched.
	 *
	 * @param Element $target Element whose data-parsoid is updated
	 * @param Element $source Element whose data-parsoid is read
	 */
	protected function updateDSRForFirstRangeNode( Element $target, Element $source ): void {
		$srcDP = DOMDataUtils::getDataParsoid( $source );
		$tgtDP = DOMDataUtils::getDataParsoid( $target );

		// Since TSRs on template content tokens are cleared by the
		// template handler, all computed dsr values for template content
		// is always inferred from top-level content values and is safe.
		// So, do not overwrite a bigger end-dsr value.
		if ( isset( $srcDP->dsr->end ) && isset( $tgtDP->dsr->end ) &&
			$tgtDP->dsr->end > $srcDP->dsr->end
		) {
			$tgtDP->dsr->start = $srcDP->dsr->start ?? null;
		} elseif ( isset( $srcDP->dsr ) ) {
			// Guarded: `clone` on a missing (null) dsr is a fatal Error, and
			// there is nothing to copy in that case anyway.
			$tgtDP->dsr = clone $srcDP->dsr;
			$tgtDP->src = $srcDP->src ?? null;
		}
	}

	/**
	 * Get the DSR of the end of a DOMRange
	 *
	 * If the range ends on an Element, that element's dsr (or null) is
	 * returned. Otherwise the result is extrapolated from the closest
	 * preceding sibling that is an Element: the lengths of the nodes skipped
	 * on the way are added to that sibling's end offset, and the end node's
	 * own length gives the width of the returned range. If that sibling's dsr
	 * has no integer end offset, its dsr is returned as is.
	 *
	 * @param DOMRangeInfo $range
	 * @return DomSourceRange|null
	 */
	private static function getRangeEndDSR( DOMRangeInfo $range ): ?DomSourceRange {
		$endNode = $range->end;
		if ( $endNode instanceof Element ) {
			return DOMDataUtils::getDataParsoid( $endNode )->dsr ?? null;
		} else {
			// In the rare scenario where the last element of a range is not an ELEMENT,
			// extrapolate based on DSR of first leftmost sibling that is an ELEMENT.
			// We don't try any harder than this for now.
			$offset = 0;
			$n = $endNode->previousSibling;
			while ( $n && !( $n instanceof Element ) ) {
				if ( $n instanceof Text ) {
					$offset += strlen( $n->nodeValue );
				} else {
					// A comment
					// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
					$offset += WTUtils::decodedCommentLength( $n );
				}
				$n = $n->previousSibling;
			}

			$dsr = null;
			if ( $n ) {
				/**
				 * The point of the above loop is to ensure we're working
				 * with a Element if there is an $n.
				 *
				 * @var Element $n
				 */
				'@phan-var Element $n';
				$dsr = DOMDataUtils::getDataParsoid( $n )->dsr ?? null;
			}

			if ( $dsr && is_int( $dsr->end ?? null ) ) {
				$len = $endNode instanceof Text
					? strlen( $endNode->nodeValue )
					// A comment
					// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
					: WTUtils::decodedCommentLength( $endNode );
				$dsr = new DomSourceRange(
					$dsr->end + $offset,
					$dsr->end + $offset + $len,
					null,
					null
				);
			}

			return $dsr;
		}
	}

	/**
	 * Returns the range ID of a node - in the case of templates, its "about" attribute.
	 *
	 * NOTE(review): DOMCompat::getAttribute() is treated as nullable by
	 * stripStartMeta(); a node without "about" would violate the declared
	 * string return type here - confirm callers only pass marker nodes.
	 *
	 * @param Element $node
	 * @return string
	 */
	protected function getRangeId( Element $node ): string {
		return DOMCompat::getAttribute( $node, "about" );
	}

	/**
	 * Find the common DOM ancestor of two DOM nodes.
	 *
	 * @param Element $startMeta
	 * @param Element $endMeta
	 * @param Element $endElem
	 * @return DOMRangeInfo
	 */
	private function getDOMRange( Element $startMeta, Element $endMeta, Element $endElem ) {
		$range = $this->findEnclosingRange( $startMeta, $endMeta, $endElem );
		$startsInFosterablePosn = DOMUtils::isFosterablePosition( $range->start );
		$next = $range->start->nextSibling;

		// Detect empty content and handle them!
		if ( WTUtils::isTplMarkerMeta( $range->start ) && $next === $endElem ) {
			Assert::invariant(
				$range->start === $range->startElem,
				"Expected startElem to be same as range.start"
			);
			if ( $startsInFosterablePosn ) {
				// Expand range!
$range->start = $range->end = $range->start->parentNode; $startsInFosterablePosn = false; } else { $emptySpan = $this->document->createElement( 'span' ); $range->start->parentNode->insertBefore( $emptySpan, $endElem ); } // Handle unwrappable content in fosterable positions // and expand template range, if required. // NOTE: Template marker meta tags are translated from comments // *after* the DOM has been built which is why they can show up in // fosterable positions in the DOM. } elseif ( $startsInFosterablePosn && ( !( $range->start instanceof Element ) || ( WTUtils::isTplMarkerMeta( $range->start ) && ( !( $next instanceof Element ) || WTUtils::isTplMarkerMeta( $next ) ) ) ) ) { $rangeStartParent = $range->start->parentNode; // If we are in a table in a foster-element position, then all non-element // nodes will be white-space and comments. Skip over all of them and find // the first table content node. $noWS = true; $nodesToMigrate = []; $newStart = $range->start; $n = $range->start instanceof Element ? $next : $range->start; while ( !( $n instanceof Element ) ) { if ( $n instanceof Text ) { $noWS = false; } $nodesToMigrate[] = $n; $n = $n->nextSibling; $newStart = $n; } // As long as $newStart is a tr/tbody or we don't have whitespace // migrate $nodesToMigrate into $newStart. Pushing whitespace into // th/td/caption can change display semantics. if ( $newStart && ( $noWS || isset( self::MAP_TBODY_TR[DOMCompat::nodeName( $newStart )] ) ) ) { /** * The point of the above loop is to ensure we're working * with a Element if there is a $newStart. * * @var Element $newStart */ '@phan-var Element $newStart'; $insertPosition = $newStart->firstChild; foreach ( $nodesToMigrate as $n ) { $newStart->insertBefore( $n, $insertPosition ); } $range->start = $newStart; // Update dsr to point to original start $this->updateDSRForFirstRangeNode( $range->start, $range->startElem ); } else { // If not, we are forced to expand the template range. 
$range->start = $range->end = $rangeStartParent; } } // Ensure range->start is an element node since we want to // add/update the data-parsoid attribute to it. if ( !( $range->start instanceof Element ) ) { $span = $this->document->createElement( 'span' ); $range->start->parentNode->insertBefore( $span, $range->start ); $span->appendChild( $range->start ); $range->start = $span; $this->updateDSRForFirstRangeNode( $range->start, $range->startElem ); } $range->start = $this->getStartConsideringFosteredContent( $range->start ); // Use the negative test since it doesn't mark the range as flipped // if range.start === range.end if ( !DOMUtils::inSiblingOrder( $range->start, $range->end ) ) { // In foster-parenting situations, the end-meta tag (and hence range.end) // can show up before the range.start which would be the table itself. // So, we record this info for later analysis. $range->flipped = true; } $this->env->log( "trace/{$this->traceType}/findranges", static function () use ( &$range ) { $msg = ''; $dp1 = DOMDataUtils::getDataParsoid( $range->start ); $dp2 = DOMDataUtils::getDataParsoid( $range->end ); $tmp1 = $dp1->tmp; $tmp2 = $dp2->tmp; $dp1->tmp = null; $dp2->tmp = null; $msg .= "\n----------------------------------------------"; $msg .= "\nFound range : " . $range->id . '; flipped? ' . ( (string)$range->flipped ) . '; offset: ' . $range->startOffset; $msg .= "\nstart-elem : " . DOMCompat::getOuterHTML( $range->startElem ) . '; DP: ' . PHPUtils::jsonEncode( DOMDataUtils::getDataParsoid( $range->startElem ) ); $msg .= "\nend-elem : " . DOMCompat::getOuterHTML( $range->endElem ) . '; DP: ' . PHPUtils::jsonEncode( DOMDataUtils::getDataParsoid( $range->endElem ) ); $msg .= "\nstart : [TAG_ID " . ( $tmp1->tagId ?? '?' ) . ']: ' . DOMCompat::getOuterHTML( $range->start ) . '; DP: ' . PHPUtils::jsonEncode( $dp1 ); $msg .= "\nend : [TAG_ID " . ( $tmp2->tagId ?? '?' ) . ']: ' . DOMCompat::getOuterHTML( $range->end ) . '; DP: ' . 
PHPUtils::jsonEncode( $dp2 );
				$msg .= "\n----------------------------------------------";
				$dp1->tmp = $tmp1;
				$dp2->tmp = $tmp2;
				return $msg;
			}
		);

		return $range;
	}

	/**
	 * Returns the current node if it's not just after fostered content, the first node
	 * of fostered content otherwise.
	 *
	 * Only a `table` node is adjusted: the start is moved back over every
	 * immediately preceding Element sibling whose data-parsoid has a
	 * non-empty `fostered` flag.
	 *
	 * @param Node $node
	 * @return Node
	 */
	protected function getStartConsideringFosteredContent( Node $node ): Node {
		if ( DOMCompat::nodeName( $node ) === 'table' ) {
			// If we have any fostered content, include it as well.
			for (
				$previousSibling = $node->previousSibling;
				$previousSibling instanceof Element &&
					!empty( DOMDataUtils::getDataParsoid( $previousSibling )->fostered );
				$previousSibling = $node->previousSibling
			) {
				$node = $previousSibling;
			}
		}
		return $node;
	}

	/**
	 * Neutralise the start marker of a range that is being dropped or merged.
	 *
	 * A `meta` node is removed from the DOM. For any other element only the
	 * `mw:*` entries are stripped from its `typeof` attribute (if it has one).
	 *
	 * @param Element $meta
	 */
	private static function stripStartMeta( Element $meta ): void {
		if ( DOMCompat::nodeName( $meta ) === 'meta' ) {
			$meta->parentNode->removeChild( $meta );
		} else {
			// Remove mw:* from the typeof.
			$type = DOMCompat::getAttribute( $meta, 'typeof' );
			if ( $type !== null ) {
				$type = preg_replace( '/(?:^|\s)mw:[^\/]*(\/\S+|(?=$|\s))/D', '', $type );
				$meta->setAttribute( 'typeof', $type );
			}
		}
	}

	/**
	 * Follow the nesting map upwards from $startId until an ID is reached
	 * that is not itself nested in another range.
	 *
	 * @param array $nestingInfo Map from a range ID to the ID of the range it is nested in
	 * @param ?string $startId ID to start from
	 * @return ?string The last ID reached; $startId itself (possibly null)
	 *   when it has no entry in $nestingInfo
	 * @throws Error if the same ID is visited twice
	 */
	private static function findToplevelEnclosingRange(
		array $nestingInfo, ?string $startId
	): ?string {
		// Walk up the implicit nesting tree to find the
		// top-level range within which rId is nested.
		// No cycles can exist since they have been suppressed.
		$visited = [];
		$rId = $startId;
		while ( isset( $nestingInfo[$rId] ) ) {
			if ( isset( $visited[$rId] ) ) {
				throw new Error( "Found a cycle in tpl-range nesting where there shouldn't have been one." );
			}
			$visited[$rId] = true;
			$rId = $nestingInfo[$rId];
		}
		return $rId;
	}

	/**
	 * Add a template to $this->compoundTpls
	 *
	 * Appended to the parts list of $compoundTplId, in this order: the source
	 * text between the previous part and this template (if there is a gap),
	 * the start element's `unwrappedWT` (if non-empty), and finally a
	 * CompoundTemplateInfo for the template itself.
	 *
	 * @param string $compoundTplId
	 * @param DOMRangeInfo $range
	 * @param TemplateInfo $templateInfo
	 */
	private function recordTemplateInfo(
		string $compoundTplId, DOMRangeInfo $range, TemplateInfo $templateInfo
	): void {
		$this->compoundTpls[$compoundTplId] ??= [];

		// Record template args info along with any intervening wikitext
		// between templates that are part of the same compound structure.
		/** @var array $tplArray */
		$tplArray = &$this->compoundTpls[$compoundTplId];
		$dp = DOMDataUtils::getDataParsoid( $range->startElem );
		$dsr = $dp->dsr;

		if ( count( $tplArray ) > 0 ) {
			$prevTplInfo = PHPUtils::lastItem( $tplArray );
			if ( $prevTplInfo->dsr->end < $dsr->start ) {
				$width = $dsr->start - $prevTplInfo->dsr->end;
				$tplArray[] = PHPUtils::safeSubstr(
					$this->frame->getSrcText(), $prevTplInfo->dsr->end, $width
				);
			}
		}

		if ( !empty( $dp->unwrappedWT ) ) {
			$tplArray[] = (string)$dp->unwrappedWT;
		}

		// Get rid of src-offsets since they aren't needed anymore.
		foreach ( $templateInfo->paramInfos as $pi ) {
			$pi->srcOffsets = null;
		}

		$tplArray[] = new CompoundTemplateInfo(
			$dsr,
			$templateInfo,
			DOMUtils::hasTypeOf( $range->startElem, 'mw:Param' )
		);
	}

	/**
	 * Determine whether adding the given range would introduce a cycle in the
	 * subsumedRanges graph.
	 *
	 * Nesting cycles with multiple ranges can show up because of foster
	 * parenting scenarios if they are not detected and suppressed.
	 *
	 * The chain of enclosing ranges is followed starting from $end. Reaching
	 * $start, or any ID already seen during this walk, is reported as a
	 * cycle; the latter also guarantees that the walk terminates.
	 *
	 * @param string $start The ID of the new range
	 * @param string $end The ID of the other range
	 * @param string[] $subsumedRanges The subsumed ranges graph, encoded as an
	 *   array in which each element maps one string range ID to another range ID
	 * @return bool
	 */
	private static function introducesCycle( string $start, string $end, array $subsumedRanges ): bool {
		$visited = [ $start => true ];
		$elt = $subsumedRanges[$end] ?? null;
		while ( $elt ) {
			if ( !empty( $visited[$elt] ) ) {
				return true;
			}
			// Remember every hop: without this, a loop in the graph that does
			// not pass through $start would be followed forever.
			$visited[$elt] = true;
			$elt = $subsumedRanges[$elt] ?? null;
		}
		return false;
	}

	/**
	 * Determine whether DOM ranges overlap.
	 *
	 * The `inSiblingOrder` check here is sufficient to determine overlaps
	 * because the algorithm in `findWrappableTemplateRanges` will put the
	 * start/end elements for intersecting ranges on the same plane and prev/
	 * curr are in textual order (which translates to dom order).
	 *
	 * For a flipped range the roles of its start and end nodes are swapped.
	 *
	 * @param DOMRangeInfo $prev
	 * @param DOMRangeInfo $curr
	 * @return bool
	 */
	private static function rangesOverlap( DOMRangeInfo $prev, DOMRangeInfo $curr ): bool {
		$prevEnd = ( !$prev->flipped ) ? $prev->end : $prev->start;
		$currStart = ( !$curr->flipped ) ? $curr->start : $curr->end;
		return DOMUtils::inSiblingOrder( $currStart, $prevEnd );
	}

	/**
	 * Identify the elements of $tplRanges that are non-overlapping.
	 * Record template info in $this->compoundTpls as we go.
	 *
	 * @param Node $docRoot
	 * @param DOMRangeInfo[] $tplRanges The potentially overlapping ranges
	 * @return DOMRangeInfo[] The non-overlapping ranges
	 */
	public function findTopLevelNonOverlappingRanges( Node $docRoot, array $tplRanges ): array {
		// For each node, assign an attribute that is a record of all
		// tpl ranges it belongs to at the top-level.
		foreach ( $tplRanges as $r ) {
			$n = !$r->flipped ? $r->start : $r->end;
			$e = !$r->flipped ? $r->end : $r->start;

			while ( $n ) {
				if ( $n instanceof Element ) {
					$this->addNodeRange( $n, $r );
					// Done
					if ( $n === $e ) {
						break;
					}
				}

				$n = $n->nextSibling;
			}
		}

		// In the first pass over `numRanges` below, `subsumedRanges` is used to
		// record purely the nested ranges. However, in the second pass, we also
		// add the relationships between overlapping ranges so that
		// `findToplevelEnclosingRange` can use that information to add `argInfo`
		// to the right `compoundTpls`. This scenario can come up when you have
		// three ranges, 1 intersecting with 2 but not 3, and 3 nested in 2.
$subsumedRanges = []; // For each range r:(s, e), walk up from s --> docRoot and if any of // these nodes have tpl-ranges (besides r itself) assigned to them, // then r is nested in those other templates and can be ignored. foreach ( $tplRanges as $r ) { $n = $r->start; while ( $n !== $docRoot ) { $ranges = $this->getNodeRanges( $n ); if ( $ranges ) { if ( $n !== $r->start ) { // 'r' is nested for sure // Record the outermost range in which 'r' is nested. $outermostId = null; $outermostOffset = null; foreach ( $ranges as $rangeId => $range ) { if ( $outermostId === null || $range->startOffset < $outermostOffset ) { $outermostId = $rangeId; $outermostOffset = $range->startOffset; } } $subsumedRanges[$r->id] = (string)$outermostId; break; } else { // n === r.start // // We have to make sure this is not an overlap scenario. // Find the ranges that r.start and r.end belong to and // compute their intersection. If this intersection has // another tpl range besides r itself, we have a winner! // // The code below does the above check efficiently. $eTpls = $this->getNodeRanges( $r->end ); $foundNesting = false; foreach ( $ranges as $otherId => $other ) { // - Don't record nesting cycles. // - Record the outermost range in which 'r' is nested in. if ( $otherId !== $r->id && !empty( $eTpls[$otherId] ) && // When we have identical ranges, pick the range with // the larger offset to be subsumed. 
( $r->start !== $other->start || $r->end !== $other->end || $other->startOffset < $r->startOffset ) && !self::introducesCycle( $r->id, (string)$otherId, $subsumedRanges ) ) { $foundNesting = true; if ( !isset( $subsumedRanges[$r->id] ) || $other->startOffset < $ranges[$subsumedRanges[$r->id]]->startOffset ) { $subsumedRanges[$r->id] = (string)$otherId; } } } if ( $foundNesting ) { // 'r' is nested break; } } } // Move up $n = $n->parentNode; } } // Sort by start offset in source wikitext usort( $tplRanges, static function ( $r1, $r2 ) { return $r1->startOffset - $r2->startOffset; } ); // Since the tpl ranges are sorted in textual order (by start offset), // it is sufficient to only look at the most recent template to see // if the current one overlaps with the previous one. // // This works because we've already identify nested ranges and can ignore them. $newRanges = []; $prev = null; foreach ( $tplRanges as $r ) { $endTagToRemove = null; $startTagToStrip = null; // Extract tplargInfo $tmp = DOMDataUtils::getDataParsoid( $r->startElem )->getTemp(); $templateInfo = $tmp->tplarginfo ?? null; $this->verifyTplInfoExpectation( $templateInfo, $tmp ); $this->env->log( "trace/{$this->traceType}/merge", static function () use ( &$DOMDataUtils, &$r ) { $msg = ''; $dp1 = DOMDataUtils::getDataParsoid( $r->start ); $dp2 = DOMDataUtils::getDataParsoid( $r->end ); $tmp1 = $dp1->tmp; $tmp2 = $dp2->tmp; $dp1->tmp = null; $dp2->tmp = null; $msg .= "\n##############################################"; $msg .= "\nrange " . $r->id . '; r-start-elem: ' . DOMCompat::getOuterHTML( $r->startElem ) . '; DP: ' . PHPUtils::jsonEncode( DOMDataUtils::getDataParsoid( $r->startElem ) ); $msg .= "\nrange " . $r->id . '; r-end-elem: ' . DOMCompat::getOuterHTML( $r->endElem ) . '; DP: ' . PHPUtils::jsonEncode( DOMDataUtils::getDataParsoid( $r->endElem ) ); $msg .= "\nrange " . $r->id . '; r-start: [TAG_ID ' . ( $tmp1->tagId ?? '?' ) . ']: ' . DOMCompat::getOuterHTML( $r->start ) . '; DP: ' . 
PHPUtils::jsonEncode( $dp1 ); $msg .= "\nrange " . $r->id . '; r-end: [TAG_ID ' . ( $tmp2->tagId ?? '?' ) . ']: ' . DOMCompat::getOuterHTML( $r->end ) . '; DP: ' . PHPUtils::jsonEncode( $dp2 ); $msg .= "\n----------------------------------------------"; $dp1->tmp = $tmp1; $dp2->tmp = $tmp2; return $msg; } ); $enclosingRangeId = self::findToplevelEnclosingRange( $subsumedRanges, $subsumedRanges[$r->id] ?? null ); if ( $enclosingRangeId ) { $this->env->log( "trace/{$this->traceType}/merge", '--nested in ', $enclosingRangeId, '--' ); // Nested -- ignore r $startTagToStrip = $r->startElem; $endTagToRemove = $r->endElem; if ( $templateInfo ) { // 'r' is nested in 'enclosingRange' at the top-level // So, enclosingRange gets r's argInfo $this->recordTemplateInfo( $enclosingRangeId, $r, $templateInfo ); } } elseif ( $prev && self::rangesOverlap( $prev, $r ) ) { // In the common case, in overlapping scenarios, r.start is // identical to prev.end. However, in fostered content scenarios, // there can true overlap of the ranges. $this->env->log( "trace/{$this->traceType}/merge", '--overlapped--' ); // See comment above, where `subsumedRanges` is defined. $subsumedRanges[$r->id] = $prev->id; // Overlapping ranges. // r is the regular kind // Merge r with prev // Note that if a table comes from a template, a foster box isn't // emitted so the enclosure isn't guaranteed. In pathological // cases, like where the table end tag isn't emitted, we can still // end up with flipped ranges if the template end marker gets into // a fosterable position (which can still happen despite being // emitted as a comment). Assert::invariant( !$r->flipped, 'Flipped range should have been enclosed.' 
);
				$startTagToStrip = $r->startElem;
				$endTagToRemove = $prev->endElem;

				$prev->end = $r->end;
				$prev->endElem = $r->endElem;
				if ( WTUtils::isMarkerAnnotation( $r->endElem ) ) {
					$endDataMw = DOMDataUtils::getDataMw( $r->endElem );
					$endDataMw->rangeId = $r->id;
					$prev->extendedByOverlapMerge = true;
				}

				// Update compoundTplInfo
				if ( $templateInfo ) {
					$this->recordTemplateInfo( $prev->id, $r, $templateInfo );
				}
			} else {
				$this->env->log( "trace/{$this->traceType}/merge", '--normal--' );
				// Default -- no overlap
				// Emit the merged range
				$newRanges[] = $r;
				$prev = $r;

				// Update compoundTpls
				if ( $templateInfo ) {
					$this->recordTemplateInfo( $r->id, $r, $templateInfo );
				}
			}

			if ( $endTagToRemove ) {
				// Remove start and end meta-tags
				// Not necessary to remove the start tag, but good to cleanup
				$endTagToRemove->parentNode->removeChild( $endTagToRemove );
				self::stripStartMeta( $startTagToStrip );
			}
		}

		return $newRanges;
	}

	/**
	 * Note that the case of nodeName varies with DOM implementation. This
	 * method currently forces the name nodeName to uppercase. In the future
	 * we can/should switch to using the "native" case of the DOM
	 * implementation; we do a case-insensitive match (by converting the result
	 * to the "native" case of the DOM implementation) in
	 * EncapsulatedContentHandler when this value is used.
	 * @param DOMRangeInfo $range
	 * @return string|null nodeName with an optional "_$stx" suffix.
	 */
	private static function findFirstTemplatedNode( DOMRangeInfo $range ): ?string {
		$firstNode = $range->start;

		// Skip tpl marker meta
		if ( WTUtils::isTplMarkerMeta( $firstNode ) ) {
			$firstNode = $firstNode->nextSibling;
		}

		// Walk past fostered nodes since they came from within a table
		// Note that this is not foolproof because in some scenarios,
		// fostered content is not marked up. Ex: when a table is templated,
		// and content from the table is fostered.
		$dp = DOMDataUtils::getDataParsoid( $firstNode );
		while ( !empty( $dp->fostered ) ) {
			$firstNode = $firstNode->nextSibling;
			/** @var Element $firstNode */
			DOMUtils::assertElt( $firstNode );
			$dp = DOMDataUtils::getDataParsoid( $firstNode );
		}

		// FIXME: It is harder to use META as a node name since this is a generic
		// placeholder for a whole bunch of things each of which has its own
		// newline constraint requirements. So, for now, I am skipping that
		// can of worms to prevent confusing the serializer with an overloaded
		// tag name.
		if ( DOMCompat::nodeName( $firstNode ) === 'meta' ) {
			return null;
		}

		// FIXME spec-compliant values would be upper-case, this is just a workaround
		// for current PHP DOM implementation and could be removed in the future
		// See discussion in the method comment above.
		$nodeName = mb_strtoupper( DOMCompat::nodeName( $firstNode ), "UTF-8" );

		return !empty( $dp->stx ) ? $nodeName . '_' . $dp->stx : $nodeName;
	}

	/**
	 * Encapsulation requires adding about attributes on the top-level
	 * nodes of the range. This requires them to all be Elements.
	 *
	 * Walks the siblings from range->start up to and including range->end:
	 * elements get the start element's "about" value, non-elements in
	 * fosterable position are removed, and any other non-element is wrapped
	 * in a `span` that carries the "about" value and the WRAPPER temp flag.
	 *
	 * @param DOMRangeInfo $range
	 */
	private function ensureElementsInRange( DOMRangeInfo $range ): void {
		$n = $range->start;
		$e = $range->end;
		$about = DOMCompat::getAttribute( $range->startElem, 'about' );
		while ( $n ) {
			$next = $n->nextSibling;
			// Decide this *before* the node is possibly replaced by a wrapper
			// span below: a freshly created span can never be identical to
			// range->end, so testing afterwards would walk past the range
			// whenever the last node of the range has to be wrapped.
			$isRangeEnd = ( $n === $e );

			if ( $n instanceof Element ) {
				$n->setAttribute( 'about', $about );
			} elseif ( DOMUtils::isFosterablePosition( $n ) ) {
				// NOTE: There cannot be any non-IEW text in fosterable position
				// since the HTML tree builder would already have fostered it out.
				// So, any non-element node found here is safe to delete since:
				// (a) this has no rendering output impact, and
				// (b) data-mw captures template output => we don't need
				// to preserve this for html2wt either. Removing this
				// lets us preserve DOM range continuity.
				$n->parentNode->removeChild( $n );
			} else {
				// Add a span wrapper to let us add about-ids to represent
				// the DOM range as a contiguous chain of DOM nodes.
				$span = $this->document->createElement( 'span' );
				$span->setAttribute( 'about', $about );
				$dp = new DataParsoid;
				$dp->setTempFlag( TempData::WRAPPER );
				DOMDataUtils::setDataParsoid( $span, $dp );
				$n->parentNode->replaceChild( $span, $n );
				$span->appendChild( $n );
			}

			if ( $isRangeEnd ) {
				break;
			}

			$n = $next;
		}
	}

	/**
	 * Find the first element to be encapsulated.
	 * Skip past marker metas and non-elements (which will all be IEW
	 * in fosterable positions in a table).
	 *
	 * @param DOMRangeInfo $range
	 * @return Element
	 * @throws Error if the end of the range (or the end of the sibling list)
	 *   is reached, or a non-element outside a fosterable position is found,
	 *   before a suitable element turns up
	 */
	private static function findEncapTarget( DOMRangeInfo $range ): Element {
		$encapTgt = $range->start;
		'@phan-var Node $encapTgt';

		// Skip template-marker meta-tags.
		while ( WTUtils::isTplMarkerMeta( $encapTgt ) ||
			!( $encapTgt instanceof Element )
		) {
			// Detect unwrappable template and bail out early.
			$unwrappable = $encapTgt === $range->end ||
				( !( $encapTgt instanceof Element ) &&
					!DOMUtils::isFosterablePosition( $encapTgt ) );

			// Running out of siblings is treated like any other unwrappable
			// case; a null node must never be fed back into the loop
			// condition, since it can never satisfy the Element return type.
			$following = $unwrappable ? null : $encapTgt->nextSibling;
			if ( $following === null ) {
				throw new Error( 'Cannot encapsulate transclusion. Start=' .
					DOMCompat::getOuterHTML( $range->startElem ) );
			}
			$encapTgt = $following;
		}
		'@phan-var Element $encapTgt';
		return $encapTgt;
	}

	/**
	 * Add markers to the DOM around the non-overlapping ranges.
	 *
	 * @param DOMRangeInfo[] $nonOverlappingRanges
	 */
	private function encapsulateTemplates( array $nonOverlappingRanges ): void {
		foreach ( $nonOverlappingRanges as $i => $range ) {
			// We should never have flipped overlapping ranges, and indeed that's
			// asserted in `findTopLevelNonOverlappingRanges`. Flipping results
			// in either completely nested ranges, or non-intersecting ranges.
			//
			// If the table causing the fostering is not transcluded, we emit a
			// foster box and wrap the whole table+fb in metas, producing nested
			// ranges. For ex,
			//
			// <table>
			// {{1x|<div>}}
			//
			// The tricky part is when the table *is* transcluded, and we omit the
			// foster box.
The common case (for some definition of common) might // be like, // // {{1x|<table>}} // {{1x|<div>}} // // Here, #mwt1 leaves a table open and the end meta from #mwt2 is // fostered, since it gets closed into the div. The range for #mwt1 // is the entire table, which thankfully contains #mwt2, so we still // have the expected entire nesting. Any tricks to extend the range // of #mwt2 beyond the table (so that we have an overlapping range) will // inevitably result in the end meta not being fostered, and we avoid // this situation altogether. // // The very edgy case is as follows, // // {{1x|<table><div>}}</div> // {{1x|<div>}} // // where both end metas are fostered. Ignoring that we don't even // roundtrip the first transclusion properly on its own, here we have // a flipped range where, since the end meta for the first range was // also fostered, the ranges still don't overlap. // FIXME: The code below needs to be aware of flipped ranges. $this->ensureElementsInRange( $range ); $tplArray = $this->compoundTpls[$range->id] ?? null; Assert::invariant( (bool)$tplArray, 'No parts for template range!' ); $encapTgt = self::findEncapTarget( $range ); $encapValid = false; $encapDP = DOMDataUtils::getDataParsoid( $encapTgt ); // Update type-of (always even if tpl-encap below will fail). // This ensures that VE will still "edit-protect" this template // and not allow its content to be edited directly. $startElem = $range->startElem; if ( $startElem !== $encapTgt ) { $t1 = DOMCompat::getAttribute( $startElem, 'typeof' ); if ( $t1 !== null ) { foreach ( array_reverse( explode( ' ', $t1 ) ) as $t ) { DOMUtils::addTypeOf( $encapTgt, $t, true ); } } } /* ---------------------------------------------------------------- * We'll attempt to update dp1.dsr to reflect the entire range of * the template. This relies on a couple observations: * * 1. In the common case, dp2.dsr->end will be > dp1.dsr->end * If so, new range = dp1.dsr->start, dp2.dsr->end * * 2. 
But, foster parenting can complicate this when range.end is a table * and range.start has been fostered out of the table (range.end). * But, we need to verify this assumption. * * 2a. If dp2.dsr->start is smaller than dp1.dsr->start, this is a * confirmed case of range.start being fostered out of range.end. * * 2b. If dp2.dsr->start is unknown, we rely on fostered flag on * range.start, if any. * ---------------------------------------------------------------- */ $dp1 = DOMDataUtils::getDataParsoid( $range->start ); $dp1DSR = isset( $dp1->dsr ) ? clone $dp1->dsr : null; $dp2DSR = self::getRangeEndDSR( $range ); if ( $dp1DSR ) { if ( $dp2DSR ) { // Case 1. above if ( $dp2DSR->end > $dp1DSR->end ) { $dp1DSR->end = $dp2DSR->end; } // Case 2. above $endDsr = $dp2DSR->start; if ( DOMCompat::nodeName( $range->end ) === 'table' && $endDsr !== null && ( $endDsr < $dp1DSR->start || !empty( $dp1->fostered ) ) ) { $dp1DSR->start = $endDsr; } } // encapsulation possible only if dp1.dsr is valid $encapValid = Utils::isValidDSR( $dp1DSR ) && $dp1DSR->end >= $dp1DSR->start; } if ( $encapValid ) { // Find transclusion info from the array (skip past a wikitext element) /** @var CompoundTemplateInfo $firstTplInfo */ $firstTplInfo = is_string( $tplArray[0] ) ? $tplArray[1] : $tplArray[0]; // Add any leading wikitext if ( $firstTplInfo->dsr->start > $dp1DSR->start ) { // This gap in dsr (between the final encapsulated content, and the // content that actually came from a template) is indicative of this // being a mixed-template-content-block and/or multi-template-content-block // scenario. // // In this case, record the name of the first node in the encapsulated // content. During html -> wt serialization, newline constraints for // this entire block has to be determined relative to this node. 
$ftn = self::findFirstTemplatedNode( $range ); if ( $ftn !== null ) { $encapDP->firstWikitextNode = $ftn; } $width = $firstTplInfo->dsr->start - $dp1DSR->start; array_unshift( $tplArray, PHPUtils::safeSubstr( $this->frame->getSrcText(), $dp1DSR->start, $width ) ); } // Add any trailing wikitext /** @var CompoundTemplateInfo $lastTplInfo */ $lastTplInfo = PHPUtils::lastItem( $tplArray ); if ( $lastTplInfo->dsr->end < $dp1DSR->end ) { $width = $dp1DSR->end - $lastTplInfo->dsr->end; $tplArray[] = PHPUtils::safeSubstr( $this->frame->getSrcText(), $lastTplInfo->dsr->end, $width ); } // Map the array of { dsr: .. , args: .. } objects to just the args property $infoIndex = 0; $parts = []; $pi = []; foreach ( $tplArray as $a ) { if ( is_string( $a ) ) { $parts[] = $a; } elseif ( $a instanceof CompoundTemplateInfo ) { // Remember the position of the transclusion relative // to other transclusions. Should match the index of // the corresponding private metadata in $templateInfos. $a->info->i = $infoIndex++; $a->info->type = 'template'; if ( $a->isParam ) { $a->info->type = 'templatearg'; } elseif ( $a->info->func ) { $a->info->type = 'parserfunction'; } $parts[] = $a->info; // FIXME: we throw away the array keys and rebuild them // again in WikitextSerializer $pi[] = array_values( $a->info->paramInfos ); } } // Set up dsr->start, dsr->end, and data-mw on the target node // Avoid clobbering existing (ex: extension) data-mw information (T214241) $encapDataMw = DOMDataUtils::getDataMw( $encapTgt ); $encapDataMw->parts = $parts; DOMDataUtils::setDataMw( $encapTgt, $encapDataMw ); $encapDP->pi = $pi; // Special case when mixed-attribute-and-content templates are // involved. This information is reliable and comes from the // AttributeExpander and gets around the problem of unmarked // fostered content that findFirstTemplatedNode runs into. $firstWikitextNode = DOMDataUtils::getDataParsoid( $range->startElem )->firstWikitextNode ?? 
null; if ( empty( $encapDP->firstWikitextNode ) && $firstWikitextNode ) { $encapDP->firstWikitextNode = $firstWikitextNode; } } else { $errors = [ 'Do not have necessary info. to encapsulate Tpl: ' . $i ]; $errors[] = 'Start Elt : ' . DOMCompat::getOuterHTML( $startElem ); $errors[] = 'End Elt : ' . DOMCompat::getOuterHTML( $range->endElem ); $errors[] = 'Start DSR : ' . PHPUtils::jsonEncode( $dp1DSR ?? 'no-start-dsr' ); $errors[] = 'End DSR : ' . PHPUtils::jsonEncode( $dp2DSR ?? [] ); $this->env->log( 'error', implode( "\n", $errors ) ); } // Make DSR range zero-width for fostered templates after // setting up data-mw. However, since template encapsulation // sometimes captures both fostered content as well as the table // from which it was fostered from, in those scenarios, we should // leave DSR info untouched. // // SSS FIXME: // 1. Should we remove the fostered flag from the entire // encapsulated block if we dont set dsr width range to zero // since only part of the block is fostered, not the entire // encapsulated block? // // 2. In both cases, should we mark these uneditable by adding // mw:Placeholder to the typeof? if ( !empty( $dp1->fostered ) ) { $encapDataMw = DOMDataUtils::getDataMw( $encapTgt ); if ( !$encapDataMw || !$encapDataMw->parts || count( $encapDataMw->parts ) === 1 ) { $dp1DSR->end = $dp1DSR->start; } } // Update DSR after fostering-related fixes are done. if ( $encapValid ) { // encapInfo.dp points to DOMDataUtils.getDataParsoid(encapInfo.target) // and all updates below update properties in that object tree. if ( empty( $encapDP->dsr ) ) { $encapDP->dsr = $dp1DSR; } else { $encapDP->dsr->start = $dp1DSR->start; $encapDP->dsr->end = $dp1DSR->end; } $encapDP->src = $encapDP->dsr->substr( $this->frame->getSrcText() ); } // Remove startElem (=range.startElem) if a meta. If a meta, // it is guaranteed to be a marker meta added to mark the start // of the template. 
/**
 * Record $range against $node, keyed by the range's ID.
 *
 * @param Element $node
 * @param DOMRangeInfo $range
 */
private function addNodeRange( Element $node, DOMRangeInfo $range ): void {
	// Holding the node inside the SplObjectStorage keeps the node object
	// alive for the duration of the pass, so the object itself can serve
	// as the lookup key.
	$storage = $this->nodeRanges;

	if ( !$storage->offsetExists( $node ) ) {
		// The stored payload is an object rather than a plain array since
		// SplObjectStorage::offsetGet() does not hand back an lval.
		$storage->offsetSet( $node, new DOMRangeInfoArray );
	}

	$storage->offsetGet( $node )->ranges[$range->id] = $range;
}
/**
 * Recursive helper for findWrappableMetaRanges().
 *
 * Walks the children of $rootNode in document order. Every element that
 * matchMetaType() recognises as a start or end marker is paired up by its
 * range id; once both markers of a pair have been seen, a DOMRangeInfo is
 * appended to $tplRanges. Elements that are not (usable) markers are
 * descended into recursively.
 *
 * @param Node $rootNode
 * @param ElementRange[] &$tpls Template start and end elements by ID
 * @param DOMRangeInfo[] &$tplRanges Template range info
 * @throws UnreachableException if a start marker is found for an entry that
 *   has no end marker recorded
 * @throws RangeBuilderException if an end marker is found for an entry that
 *   has no start marker recorded
 */
private function findWrappableTemplateRangesRecursive(
	Node $rootNode, array &$tpls, array &$tplRanges
): void {
	$elem = $rootNode->firstChild;

	while ( $elem ) {
		// get the next sibling before doing anything since
		// we may delete elem as part of encapsulation
		$nextSibling = $elem->nextSibling;

		if ( $elem instanceof Element ) {
			$metaType = $this->matchMetaType( $elem );

			// Ignore templates without tsr.
			//
			// These are definitely nested in other templates / extensions
			// and need not be wrapped themselves since they
			// can never be edited directly.
			//
			// NOTE: We are only testing for tsr presence on the start-elem
			// because wikitext errors can lead to parse failures and no tsr
			// on end-meta-tags.
			//
			// Ex: "<ref>{{1x|bar}}<!--bad-></ref>"
			if ( $metaType !== null &&
				( !empty( DOMDataUtils::getDataParsoid( $elem )->tsr ) ||
					str_ends_with( $metaType, '/End' )
				)
			) {
				$about = $this->getRangeId( $elem );
				$tpl = $tpls[$about] ?? null;

				// Is this a start marker?
				if ( !str_ends_with( $metaType, '/End' ) ) {
					if ( $tpl ) {
						$tpl->startElem = $elem;
						// content or end marker existed already
						if ( !empty( $tpl->endElem ) ) {
							// End marker was foster-parented.
							// Found actual start tag.
							$tplRanges[] = $this->getDOMRange( $elem, $tpl->endElem, $tpl->endElem );
						} else {
							// should not happen!
							throw new UnreachableException( "start found after content for $about." );
						}
					} else {
						$tpl = new ElementRange;
						$tpl->startElem = $elem;
						$tpls[$about] = $tpl;
					}
				} else {
					// elem is the end-meta tag
					if ( $tpl ) {
						/* ------------------------------------------------------------
						 * Special case: In some cases, the entire template content can
						 * get fostered out of a table, not just the start/end marker.
						 *
						 * Simplest example:
						 *
						 *   {|
						 *   {{1x|foo}}
						 *   |}
						 *
						 * More complex example:
						 *
						 *   {|
						 *   {{1x|
						 *   a
						 *    b
						 *
						 *     c
						 *   }}
						 *   |}
						 *
						 * Since meta-tags don't normally get fostered out, this scenario
						 * only arises when the entire content including meta-tags was
						 * wrapped in p-tags. So, we look to see if:
						 * 1. the end-meta-tag's parent has a table sibling,
						 * 2. the start meta's parent is marked as fostered.
						 * If so, we recognize this as an adoption scenario and fix up
						 * the table's DSR start (when it is unknown) from the tsr of
						 * the start-meta-tag's parent.
						 * ------------------------------------------------------------*/
						$sm = $tpl->startElem;

						// TODO: this should only happen in fairly specific cases of the
						// annotation processing and should eventually be handled properly.
						// In the meantime, we create and log an exception to have an idea
						// of the amplitude of the problem.
						if ( $sm === null ) {
							throw new RangeBuilderException( 'No start tag found for the range' );
						}

						$em = $elem;
						$ee = $em;
						$tbl = $em->parentNode->nextSibling;

						// Dont get distracted by a newline node -- skip over it
						// Unsure why it shows up occasionally
						if ( $tbl instanceof Text && $tbl->nodeValue === "\n" ) {
							$tbl = $tbl->nextSibling;
						}

						$dp = !DOMUtils::atTheTop( $sm->parentNode ) ?
							DOMDataUtils::getDataParsoid( $sm->parentNode ) : null;
						if ( $tbl && DOMCompat::nodeName( $tbl ) === 'table' && !empty( $dp->fostered ) ) {
							'@phan-var Element $tbl';
							/** @var Element $tbl */
							$tblDP = DOMDataUtils::getDataParsoid( $tbl );
							// isset() is false for null, so it may only guard the
							// existence of the dsr object itself; the start offset
							// has to be compared against null explicitly. (Testing
							// isset( ...->dsr->start ) together with "=== null"
							// could never succeed.)
							if ( isset( $dp->tsr->start ) &&
								isset( $tblDP->dsr ) && $tblDP->dsr->start === null
							) {
								$tblDP->dsr->start = $dp->tsr->start;
							}
							$tbl->setAttribute( 'about', $about ); // set about on elem
							$ee = $tbl;
						}
						$tplRanges[] = $this->getDOMRange( $sm, $em, $ee );
					} else {
						// The end tag can appear before the start tag if it is fostered out
						// of the table and the start tag is not.
						// It can even technically happen that both tags are fostered out of
						// a table and that the range is flipped: while the fostered content of
						// single table is fostered in-order, the ordering might change
						// across tables if the tags are not initially fostered by the same
						// table.
						$tpl = new ElementRange;
						$tpl->endElem = $elem;
						$tpls[$about] = $tpl;
					}
				}
			} else {
				$this->findWrappableTemplateRangesRecursive( $elem, $tpls, $tplRanges );
			}
		}

		$elem = $nextSibling;
	}
}
/**
 * Build the range that encloses $startMeta and $endMeta.
 *
 * The range's start/end nodes are located by climbing from the end node
 * towards the document until an ancestor shared with $startMeta is found.
 * If no shared ancestor is met before the document node, the range is
 * returned without start/end being assigned.
 *
 * @param Element $startMeta
 * @param Element $endMeta
 * @param ?Element $endElem climbed from instead of $endMeta when given
 * @return DOMRangeInfo
 */
protected function findEnclosingRange(
	Element $startMeta, Element $endMeta, ?Element $endElem = null
): DOMRangeInfo {
	$rangeId = Utils::stripParsoidIdPrefix( $this->getRangeId( $startMeta ) );
	$startOffset = DOMDataUtils::getDataParsoid( $startMeta )->tsr->start;
	$range = new DOMRangeInfo( $rangeId, $startOffset, $startMeta, $endMeta );

	// Find common ancestor of startMeta and endElem
	$ancestorsOfStart = DOMUtils::pathToRoot( $startMeta );
	$child = $endElem ?? $endMeta;

	for (
		$parent = $child->parentNode;
		$parent && $parent->nodeType !== XML_DOCUMENT_NODE;
		$child = $parent, $parent = $child->parentNode
	) {
		$pos = array_search( $parent, $ancestorsOfStart, true );
		if ( $pos === false ) {
			// Not on the start path; keep climbing.
			continue;
		}
		if ( $pos === 0 ) {
			throw new UnreachableException(
				'The startMeta cannot be the common ancestor.'
			);
		}
		// The entry just before the shared ancestor on the start path
		// and the current node on the end path delimit the range.
		$range->start = $ancestorsOfStart[$pos - 1];
		$range->end = $child;
		return $range;
	}

	return $range;
}
��$ $ PWrap.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Assert\UnreachableException; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\NodeData\TempData; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; class PWrap implements Wt2HtmlDOMProcessor { /** * Flattens an array with other arrays for elements into * an array without nested arrays. * * @param array[] $a * @return array */ private function flatten( array $a ): array { return $a === [] ? [] : array_merge( ...$a ); } private static function pWrapOptionalChildren( Element $elt ): bool { foreach ( $elt->childNodes as $c ) { if ( !self::pWrapOptional( $c ) ) { return false; } } return true; } /** * Is a P-wrapper optional for this node? * * The following nodes do not need p wrappers of their own: * - whitespace nodes * - comment nodes * - HTML metadata tags generated by wikitext (not always rendering-transparent) * and these metatags don't need p-wrappers of their own. Both Remex and Parsoid * have identical p-wrapping behavior on these tags. This is a superset of * \\MediaWiki\Tidy\RemexCompatMunger::$metadataElements. 
* - parsoid-added span wrappers around pwrap-optional nodes * * @param Node $n * @return bool */ public static function pWrapOptional( Node $n ): bool { return $n instanceof Comment || ( $n instanceof Text && preg_match( '/^\s*$/D', $n->nodeValue ) ) || ( $n instanceof Element && ( DOMUtils::isMetaDataTag( $n ) || ( DOMDataUtils::getDataParsoid( $n )->getTempFlag( TempData::WRAPPER ) && self::pWrapOptionalChildren( $n ) ) ) ); } /** * Can we split the subtree rooted at $n into multiple adjacent * subtrees rooted in a clone of $n where each of those subtrees * get a contiguous subset of $n's children? * * This is probably equivalent to asking if this node supports the * adoption agency algorithm in the HTML5 spec. * * @param Node $n * @return bool */ private function isSplittableTag( Node $n ): bool { // Seems safe to split span, sub, sup, cite tags // // However, if we want to mimic Parsoid and HTML5 spec // precisely, we should only use isFormattingElt(n) return DOMUtils::isFormattingElt( $n ); } /** * Merge a contiguous run of split subtrees that have identical pwrap properties * * @param Element $n * @param array $a * @return array */ private function mergeRuns( Element $n, array $a ): array { $ret = []; // This flag should be transferred to the rightmost // clone of this node in the loop below. $ndp = DOMDataUtils::getDataParsoid( $n ); $origAIEnd = $ndp->autoInsertedEnd ?? null; $origEndTSR = $ndp->tmp->endTSR ?? 
/**
 * Implements the split operation used by pWrapDOM.
 *
 * Each returned entry is [ 'pwrap' => ?bool, 'node' => Node ] where
 * 'pwrap' is interpreted by pWrapDOM as:
 *
 *   true:  opens a paragraph or continues adding to a paragraph
 *   false: closes a paragraph
 *   null:  agnostic, doesn't open or close a paragraph
 *
 * @param Node $n
 * @return array
 */
private function split( Node $n ): array {
	// Nodes that don't need a wrapper of their own get null so that
	// they don't break up a run of wrappable nodes.
	if ( self::pWrapOptional( $n ) ) {
		return [ [ 'pwrap' => null, 'node' => $n ] ];
	}

	if ( $n instanceof Text ) {
		return [ [ 'pwrap' => true, 'node' => $n ] ];
	}

	// block tag OR non-splittable inline tag
	if ( !$this->isSplittableTag( $n ) || count( $n->childNodes ) === 0 ) {
		$wrappable = !DOMUtils::hasBlockTag( $n );
		return [ [ 'pwrap' => $wrappable, 'node' => $n ] ];
	}

	// splittable inline tag:
	// split each child in turn, then merge runs
	DOMUtils::assertElt( $n );
	$perChild = [];
	foreach ( $n->childNodes as $kid ) {
		$perChild[] = $this->split( $kid );
	}
	$flat = $this->flatten( $perChild );

	return $this->mergeRuns( $n, $flat );
}
/**
 * Wrap children of '$root' with paragraph tags
 * so that the final output has the following properties:
 *
 * 1. A paragraph will have at least one non-whitespace text
 *    node or an non-block element node in its subtree.
 *
 * 2. Two paragraph nodes aren't siblings of each other.
 *
 * 3. If a child of $root is not a paragraph node, it is one of:
 *    - a white-space only text node
 *    - a comment node
 *    - a block element
 *    - a splittable inline element which has some block node
 *      on *all* paths from it to all leaves in its subtree.
 *    - a non-splittable inline element which has some block node
 *      on *some* path from it to a leaf in its subtree.
 *
 * This output is generated with the following algorithm
 *
 * 1. Block nodes are skipped over
 * 2. Non-splittable inline nodes that have a block tag
 *    in its subtree are skipped over.
 * 3. A splittable inline node, I, that has at least one block tag
 *    in its subtree is split into multiple tree such that
 *    - each new tree is rooted in I
 *    - the trees alternate between two kinds
 *      (a) it has no block node inside
 *          => pwrap is true
 *      (b) all paths from I to its leaves have some block node inside
 *          => pwrap is false
 * 4. A paragraph tag is wrapped around adjacent runs of comment nodes,
 *    text nodes, and an inline node that has no block node embedded inside.
 *    This paragraph tag does not start with nodes for which p-wrapping is
 *    optional (as determined by the pWrapOptional helper). The current
 *    algorithm also ensures that it doesn't end with one of those either
 *    (if it impacts template / param / annotation range building).
 *
 * @param Element|DocumentFragment $root
 * @throws UnreachableException if split() yields a 'pwrap' value other
 *   than true, false or null
 */
private function pWrapDOM( Node $root ): void {
	$state = new PWrapState();
	$c = $root->firstChild;

	while ( $c ) {
		// Grab the sibling up front: $c may get moved into a <p> or be
		// replaced by its split pieces below.
		$next = $c->nextSibling;

		if ( DOMUtils::isRemexBlockNode( $c ) ) {
			$state->reset();
		} else {
			foreach ( $this->split( $c ) as $v ) {
				$n = $v['node'];
				if ( $v['pwrap'] === false ) {
					// Closes the current paragraph; node stays a child of $root.
					$state->reset();
					$root->insertBefore( $n, $next );
				} elseif ( $v['pwrap'] === null ) {
					// Agnostic node: joins an open paragraph, but never opens one.
					if ( $state->p ) {
						$state->p->appendChild( $n );
						$state->processOptionalNode( $n );
					} else {
						$root->insertBefore( $n, $next );
					}
				} elseif ( $v['pwrap'] === true ) {
					// Wrappable node: open a paragraph on demand.
					if ( !$state->p ) {
						$state->p = $root->ownerDocument->createElement( 'p' );
						$root->insertBefore( $state->p, $next );
					}
					$state->p->appendChild( $n );
				} else {
					throw new UnreachableException( 'Unexpected value for pwrap.' );
				}
			}
		}

		$c = $next;
	}

	$state->reset();
}
/**
 * Describes one template / annotation range found in the DOM.
 */
class DOMRangeInfo {
	/** Identifier of the range. */
	public string $id;

	/** Start offset passed in by the range builder. */
	public int $startOffset;

	/**
	 * $startElem, $endElem are the start/end meta tags for a transclusion
	 * $start, $end are the start/end DOM nodes after the range is
	 * expanded, merged with other ranges, etc. In the simple cases, they will
	 * be identical to $startElem, $endElem.
	 */
	public Element $startElem;

	public Element $endElem;

	/**
	 * Not set by the constructor. Defaults to null so that reading it
	 * before a start node has been assigned yields null instead of a
	 * "must not be accessed before initialization" Error.
	 */
	public ?Node $start = null;

	/**
	 * Not set by the constructor. Defaults to null for the same reason
	 * as $start.
	 */
	public ?Node $end = null;

	/**
	 * In foster-parenting situations, the end-meta tag can show up before the
	 * start-meta. We record this info for later analysis.
	 */
	public bool $flipped = false;

	/**
	 * A range is marked as extended when it is found to overlap with another
	 * range during findTopLevelNonOverlappingRanges.
	 */
	public bool $extendedByOverlapMerge = false;

	/**
	 * @param string $id
	 * @param int $startOffset
	 * @param Element $startMeta stored as $startElem
	 * @param Element $endMeta stored as $endElem
	 */
	public function __construct(
		string $id, int $startOffset, Element $startMeta, Element $endMeta
	) {
		$this->id = $id;
		$this->startOffset = $startOffset;
		$this->startElem = $startMeta;
		$this->endElem = $endMeta;
	}
}
�cjE]� ]� Linter.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use stdClass; use Wikimedia\Assert\UnreachableException; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\NodeData\TempData; use Wikimedia\Parsoid\NodeData\TemplateInfo; use Wikimedia\Parsoid\Utils\DiffDOMUtils; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Utils\PHPUtils; use Wikimedia\Parsoid\Utils\Timing; use Wikimedia\Parsoid\Utils\Utils; use Wikimedia\Parsoid\Utils\WTUtils; use Wikimedia\Parsoid\Wikitext\Consts; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; /** * DOM pass that walks the DOM tree, detects specific wikitext patterns, * and emits them as linter events. */ class Linter implements Wt2HtmlDOMProcessor { private ?ParsoidExtensionAPI $extApi = null; private ?string $obsoleteTagsRE = null; private array $seenIds = []; /** @var array<string,bool>|null */ private ?array $tagsWithChangedMisnestingBehavior = null; /** * We are trying to find HTML5 tags that have different behavior compared to HTML4 * in some misnesting scenarios around wikitext paragraphs. * * Ex: Input: <p><small>a</p><p>b</small></p> * Tidy output: <p><small>a</small></p><p><small>b</small></p> * HTML5 output: <p><small>a</small></p><p><small>b</small></p> * * So, all good here. * But, see how output changes when we use <span> instead * * Ex: Input: <p><span>a</p><p>b</span></p> * Tidy output: <p><span>a</span></p><p><span>b</span></p> * HTML5 output: <p><span>a</span></p><p>b</p> * * The source wikitext is "<span>a\n\nb</span>". 
/**
 * Lazily compute (and cache on the instance) the set of HTML5 tags whose
 * misnesting behavior around wikitext paragraphs differs from Tidy's.
 *
 * A tag from Consts::$HTML['HTML5Tags'] is part of the set when it
 * - is allowed by the sanitizer (otherwise it would be escaped),
 * - is not one of the HTML4 block tags listed below,
 * - is not a formatting tag (those get reconstructed by the HTML5
 *   tree builder to wrap all originally intended content), and
 * - is not a void tag (it cannot wrap anything).
 *
 * See https://phabricator.wikimedia.org/T176363#3628173
 *
 * @return array
 * @phan-return array<string,bool>
 */
private function getTagsWithChangedMisnestingBehavior(): array {
	if ( $this->tagsWithChangedMisnestingBehavior !== null ) {
		return $this->tagsWithChangedMisnestingBehavior;
	}

	// This set is frozen in time. It gets us down to the requisite
	// list of affected HTML5 tags, but shouldn't be used for anything
	// other than that.
	$frozenBlockTags = PHPUtils::makeSet( [
		'div', 'p',
		// tables
		'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
		// lists
		'ul', 'ol', 'li', 'dl', 'dt', 'dd',
		// HTML5 heading content
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
		// HTML5 sectioning content
		'article', 'aside', 'nav', 'section', 'footer', 'header',
		'figure', 'figcaption', 'fieldset', 'details', 'blockquote',
		// other
		'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
		'map', 'object', 'pre', 'progress', 'video',
	] );

	$affected = [];
	foreach ( Consts::$HTML['HTML5Tags'] as $tagName => $unused ) {
		if ( !isset( Consts::$Sanitizer['AllowedLiteralTags'][$tagName] ) ) {
			continue;
		}
		if ( isset( $frozenBlockTags[$tagName] ) ) {
			continue;
		}
		if ( isset( Consts::$HTML['FormattingTags'][$tagName] ) ) {
			continue;
		}
		if ( isset( Consts::$HTML['VoidTags'][$tagName] ) ) {
			continue;
		}
		$affected[$tagName] = true;
	}

	$this->tagsWithChangedMisnestingBehavior = $affected;
	return $affected;
}
/**
 * Given a tplInfo object, determine whether we are:
 * - Not processing template content (could be extension or top level page):
 *   returns null.
 * - Processing encapsulated content that is produced by a single template:
 *   returns [ 'name' => <template name> ].
 * - Processing encapsulated content that comes from multiple templates:
 *   returns [ 'multiPartTemplateBlock' => true ].
 *
 * FIXME: We might potentially be computing this information redundantly
 * for every lint we find within this template's content. It could probably
 * be cached in tplInfo after it is computed once.
 *
 * @param Env $env
 * @param ?stdClass $tplInfo
 * @return ?array
 */
public static function findEnclosingTemplateName( Env $env, ?stdClass $tplInfo ): ?array {
	if ( !$tplInfo || !DOMUtils::hasTypeOf( $tplInfo->first, 'mw:Transclusion' ) ) {
		return null;
	}

	$dataMw = DOMDataUtils::getDataMw( $tplInfo->first );

	// This count check is conservative in that link suffixes and prefixes
	// could artifically add an extra element to the parts array but we
	// don't have a good way of distinguishing that right now. It will require
	// a non-string representation for them and a change in spec along with
	// a version bump and all that song and dance. If linting accuracy in these
	// scenarios become a problem, we can revisit this.
	if ( empty( $dataMw->parts ) || count( $dataMw->parts ) !== 1 ) {
		return [ 'multiPartTemplateBlock' => true ];
	}

	$only = $dataMw->parts[0];
	if ( !( $only instanceof TemplateInfo ) ) {
		throw new UnreachableException(
			"a single part will always be a TemplateInfo not a string"
		);
	}

	// href set: could be "function"; otherwise type === 'templatearg' or 'template'
	// PORT-FIXME: Should that be SiteConfig::relativeLinkPrefix() rather than './'?
	$tplName = !empty( $only->href )
		? PHPUtils::stripPrefix( $only->href, './' )
		: trim( $only->targetWt );

	return [ 'name' => $tplName ];
}
// No need to check for tr/table since content cannot show up there if ( DOMUtils::atTheTop( $node ) || preg_match( '/^(?:a|td|th|h\d)$/D', DOMCompat::nodeName( $node ) ) ) { return false; } $next = DiffDOMUtils::nextNonSepSibling( $node ); if ( !$next ) { return $this->hasMisnestableContent( $node->parentNode, $name ); } $contentNode = null; if ( DOMCompat::nodeName( $next ) === 'p' && !WTUtils::isLiteralHTMLNode( $next ) ) { $contentNode = DiffDOMUtils::firstNonSepChild( $next ); } else { $contentNode = $next; } // If the first "content" node we find is a matching // stripped tag, we have nothing that can get misnested return $contentNode && !( $contentNode instanceof Element && DOMUtils::isMarkerMeta( $contentNode, 'mw:Placeholder/StrippedTag' ) && isset( DOMDataUtils::getDataParsoid( $contentNode )->name ) && DOMDataUtils::getDataParsoid( $contentNode )->name === $name ); } /** * Indicate whether an end tag is optional for this node * * See https://www.w3.org/TR/html5/syntax.html#optional-tags * * End tags for tr/td/th/li are entirely optional since they * require a parent container and can only be followed by like * kind. * * Caveat: <li>foo</li><ol>..</ol> and <li>foo<ol>..</ol> * generate different DOM trees, so explicit </li> tag * is required to specify which of the two was intended. * * With that one caveat around nesting, the parse with/without * the end tag is identical. For now, ignoring that caveat * since they aren't like to show up in our corpus much. * * For the other tags in that w3c spec section, I haven't reasoned * through when exactly they are optional. Not handling that complexity * for now since those are likely uncommon use cases in our corpus. 
/**
 * Lint Treebuilder fixups marked by ProcessTreeBuilderFixups
 *
 * It handles the following scenarios:
 *
 * 1. Unclosed end tags (`missing-end-tag`, `missing-end-tag-in-heading`)
 * 2. Invalid self-closed tags (`self-closed-tag`)
 * 3. Stripped tags (`stripped-tag`)
 *
 * In addition, we have specialized categories for some patterns
 * where we encounter unclosed end tags.
 *
 * 4. misnested-tag
 * 5. html5-misnesting
 * 6. multiple-unclosed-formatting-tags
 * 7. unclosed-quotes-in-heading
 *
 * @param Env $env receives the lints via recordLint()
 * @param Element $c node being examined
 * @param DataParsoid $dp data-parsoid of $c
 * @param ?stdClass $tplInfo enclosing template info, if any
 */
private function lintTreeBuilderFixup(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// This might have been processed as part of
	// misnested-tag category identification.
	if ( $dp->getTempFlag( TempData::LINTED ) ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	// During DSR computation, stripped meta tags
	// surrender their width to its previous sibling.
	// We record the original DSR in the tmp attribute
	// for that reason.
	$dsr = self::findLintDSR(
		$tplLintInfo, $tplInfo, $dp->tmp->origDSR ?? $dp->dsr ?? null
	);

	if ( DOMUtils::isMarkerMeta( $c, 'mw:Placeholder/StrippedTag' ) ) {
		$env->recordLint( 'stripped-tag', [
			'dsr' => $dsr,
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $dp->name ?? null ],
		] );
	}

	// Dont bother linting for auto-inserted start/end or self-closing-tag if:
	// 1. c is a void element
	//    Void elements won't have auto-inserted start/end tags
	//    and self-closing versions are valid for them.
	//
	// 2. c is tbody (FIXME: don't remember why we have this exception)
	//
	// 3. c is not an HTML element (unless they are i/b quotes or tables)
	//
	// 4. c doesn't have DSR info and doesn't come from a template either
	$cNodeName = DOMCompat::nodeName( $c );
	$isHtmlElement = WTUtils::hasLiteralHTMLMarker( $dp );
	if ( Utils::isVoidElement( $cNodeName ) ||
		$cNodeName === 'tbody' ||
		!( $isHtmlElement || DOMUtils::isQuoteElt( $c ) || $cNodeName === 'table' ) ||
		( $tplInfo === null && $dsr === null )
	) {
		return;
	}

	if ( !empty( $dp->selfClose ) && $cNodeName !== 'meta' ) {
		$env->recordLint( 'self-closed-tag', [
			'dsr' => $dsr,
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $cNodeName ],
		] );
		// The other checks won't pass - no need to test them.
		return;
	}

	// Everything below only concerns tags whose end tag was auto-inserted.
	if ( ( $dp->autoInsertedEnd ?? null ) !== true ||
		!( $tplInfo || ( $dsr->openWidth ?? 0 ) > 0 )
	) {
		return;
	}

	$lintObj = [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => $cNodeName ],
	];

	// FIXME: This literal html marker check is strictly not required
	// (a) we've already checked that above and know that isQuoteElt is
	//     not one of our tags.
	// (b) none of the tags in the list have native wikitext syntax =>
	//     they will show up as literal html tags.
	// But, in the interest of long-term maintenance in the face of
	// changes (to wikitext or html specs), let us make it explicit.
	if ( $isHtmlElement &&
		isset( $this->getTagsWithChangedMisnestingBehavior()[$cNodeName] ) &&
		$this->hasMisnestableContent( $c, $cNodeName ) &&
		// Tidy WTF moment here!
		// I don't know why Tidy does something very different
		// when there is an identical nested tag here.
		//
		// <p><span id='1'>a<span>X</span></p><p>b</span></p>
		// vs.
		// <p><span id='1'>a</p><p>b</span></p>  OR
		// <p><span id='1'>a<del>X</del></p><p>b</span></p>
		//
		// For the first snippet, Tidy only wraps "a" with the id='1' span
		// For the second and third snippets, Tidy wraps "b" with the id='1' span as well.
		//
		// For the corresponding wikitext that generates the above token stream,
		// Parsoid (and Remex) won't wrap 'b' with the id=1' span at all.
		!$this->hasIdenticalNestedTag( $c, $cNodeName )
	) {
		$env->recordLint( 'html5-misnesting', $lintObj );
		return;
	}

	if ( !$isHtmlElement && DOMUtils::isQuoteElt( $c ) ) {
		$ancestor = $this->getHeadingAncestor( $c->parentNode );
		if ( $ancestor ) {
			$lintObj['params']['ancestorName'] = DOMCompat::nodeName( $ancestor );
			$env->recordLint( 'unclosed-quotes-in-heading', $lintObj );
			return;
		}
	}

	$adjNode = $this->getMatchingMisnestedNode( $c, $c );
	if ( $adjNode ) {
		// Flag the partner node so it isn't linted a second time.
		$adjDp = DOMDataUtils::getDataParsoid( $adjNode );
		$adjDp->setTempFlag( TempData::LINTED );
		$env->recordLint( 'misnested-tag', $lintObj );
		return;
	}

	if ( $this->endTagOptional( $c ) || !empty( $dp->autoInsertedStart ) ) {
		return;
	}

	$lintObj['params']['inTable'] = DOMUtils::hasNameOrHasAncestorOfName( $c, 'table' );
	$category = $this->getHeadingAncestor( $c ) ?
		'missing-end-tag-in-heading' : 'missing-end-tag';

	// Skip if covered by deletable-table-tag, i.e. an unclosed table
	// that is immediately followed by another table. The check has to
	// look at the *next sibling's* name: testing $c a second time would
	// suppress the lint for every table that has any following sibling.
	$next = DiffDOMUtils::nextNonSepSibling( $c );
	$coveredByDeletableTableTag = $cNodeName === 'table' && $next &&
		DOMCompat::nodeName( $next ) === 'table';
	if ( !$coveredByDeletableTableTag ) {
		$env->recordLint( $category, $lintObj );
	}

	if ( isset( Consts::$HTML['FormattingTags'][$cNodeName] ) &&
		$this->matchedOpenTagPairExists( $c, $dp )
	) {
		$env->recordLint( 'multiple-unclosed-formatting-tags', $lintObj );
	}
}
( DOMCompat::nodeName( $maybeFostered ) === 'link' && DOMUtils::hasTypeOf( $maybeFostered, 'mw:Extension/section' ) ) ) ) { // Skip rendering-transparent nodes if they come from a template, // since they'll roundtrip cleanly regardless $fosteredRenderingTransparent = $fosteredRenderingTransparent || !$tplInfo; $maybeFostered = $maybeFostered->previousSibling; } if ( $maybeFostered instanceof Element && !empty( DOMDataUtils::getDataParsoid( $maybeFostered )->fostered ) ) { $type = 'fostered'; } elseif ( $fosteredRenderingTransparent ) { $type = 'fostered-transparent'; } else { return; } $tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo ); $lintObj = [ 'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ), 'templateInfo' => $tplLintInfo, ]; $env->recordLint( $type, $lintObj ); } /** * Lint obsolete HTML tags. * * Lint category: `obsolete-tag`, `tidy-font-bug` */ private function lintObsoleteTag( Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo ): void { if ( !$this->obsoleteTagsRE ) { $elts = []; foreach ( Consts::$HTML['OlderHTMLTags'] as $tag => $dummy ) { // Looks like all existing editors let editors add the <big> tag. // VE has a button to add <big>, it seems so does the WikiEditor // and JS wikitext editor. So, don't flag BIG as an obsolete tag. if ( $tag !== 'big' ) { $elts[] = preg_quote( $tag, '/' ); } } $this->obsoleteTagsRE = '/^(?:' . implode( '|', $elts ) . ')$/D'; } $tplLintInfo = null; if ( ( empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd ) ) && preg_match( $this->obsoleteTagsRE, DOMCompat::nodeName( $c ) ) ) { $tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo ); $lintObj = [ 'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? 
null ), 'templateInfo' => $tplLintInfo, 'params' => [ 'name' => DOMCompat::nodeName( $c ) ], ]; $env->recordLint( 'obsolete-tag', $lintObj ); } if ( DOMCompat::nodeName( $c ) === 'font' && $c->hasAttribute( 'color' ) ) { /* ---------------------------------------------------------- * Tidy migrates <font> into the link in these cases * <font>[[Foo]]</font> * <font>[[Foo]]l</font> (link-trail) * <font><!--boo-->[[Foo]]</font> * <font>__NOTOC__[[Foo]]</font> * <font>[[Category:Foo]][[Foo]]</font> * <font>{{1x|[[Foo]]}}</font> * * Tidy does not migrate <font> into the link in these cases * <font> [[Foo]]</font> * <font>[[Foo]] </font> * <font>[[Foo]]L</font> (not a link-trail) * <font>[[Foo]][[Bar]]</font> * <font>[[Foo]][[Bar]]</font> * * <font> is special. * This behavior is not seen with other formatting tags. * * Remex/parsoid won't do any of this. * This difference in behavior only matters when the font tag * specifies a link colour because the link no longer renders * as blue/red but in the font-specified colour. * ---------------------------------------------------------- */ $tidyFontBug = $c->firstChild !== null; $haveLink = false; for ( $n = $c->firstChild; $n; $n = $n->nextSibling ) { $nodeName = DOMCompat::nodeName( $n ); if ( $nodeName !== 'a' && !WTUtils::isRenderingTransparentNode( $n ) && !WTUtils::isTplMarkerMeta( $n ) ) { $tidyFontBug = false; break; } if ( $nodeName === 'a' || $nodeName === 'figure' ) { if ( !$haveLink ) { $haveLink = true; } else { $tidyFontBug = false; break; } } } if ( $tidyFontBug ) { $tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo ); $env->recordLint( 'tidy-font-bug', [ 'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ), 'templateInfo' => $tplLintInfo, 'params' => [ 'name' => 'font' ] ] ); } } } /** * Log bogus (=unrecognized) media options. 
*
	 * See - https://www.mediawiki.org/wiki/Help:Images#Syntax
	 *
	 * Lint category: `bogus-image-options`
	 */
	private function lintBogusImageOptions(
		Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		// Despite the lint category name, this checks all media, not just images
		if ( WTUtils::isGeneratedFigure( $c ) && !empty( $dp->optList ) ) {
			$items = [];
			$bogusPx = $dp->getTempFlag( TempData::BOGUS_PX );
			foreach ( $dp->optList as $item ) {
				if (
					$item['ck'] === 'bogus' ||
					( $bogusPx && $item['ck'] === 'width' )
				) {
					$items[] = $item['ak'];
				}
			}
			if ( $items ) {
				$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
				$env->recordLint( 'bogus-image-options', [
					'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
					'templateInfo' => $tplLintInfo,
					'params' => [ 'items' => $items ]
				] );
			}
		}
	}

	/**
	 * Lint tables Tidy deletes.
	 *
	 * Lint category: `deletable-table-tag`
	 *
	 * In this example below, the second table is in a fosterable position
	 * (inside a <tr>). The tree builder closes the first table at that point
	 * and starts a new table there. We are detecting this pattern because
	 * Tidy does something very different here. It strips the inner table
	 * and retains the outer table. So, for preserving rendering of pages
	 * that are tailored for Tidy, editors have to fix up this wikitext
	 * to strip the inner table (to mimic what Tidy does).
	 *
	 * {| style='border:1px solid red;'
	 * |a
	 * |-
	 * {| style='border:1px solid blue;'
	 * |b
	 * |c
	 * |}
	 * |}
	 */
	private function lintDeletableTableTag(
		Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		if ( DOMCompat::nodeName( $c ) === 'table' ) {
			$prev = DiffDOMUtils::previousNonSepSibling( $c );
			if ( $prev instanceof Element && DOMCompat::nodeName( $prev ) === 'table' &&
				!empty( DOMDataUtils::getDataParsoid( $prev )->autoInsertedEnd )
			) {
				$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
				$dsr = self::findLintDSR(
					$tplLintInfo,
					$tplInfo,
					$dp->dsr ?? null,
					static function ( ?DomSourceRange $nodeDSR ): ?DomSourceRange {
						// Identify the dsr-span of the opening tag
						// of the table that needs to be deleted
						$x = $nodeDSR === null ? null : ( clone $nodeDSR );
						if ( !empty( $x->openWidth ) ) {
							$x->end = $x->innerStart();
							$x->openWidth = 0;
							$x->closeWidth = 0;
						}
						return $x;
					}
				);
				$lintObj = [
					'dsr' => $dsr,
					'templateInfo' => $tplLintInfo,
					'params' => [ 'name' => 'table' ],
				];
				$env->recordLint( 'deletable-table-tag', $lintObj );
			}
		}
	}

	/**
	 * Find the first child passing the filter.
	 */
	private function findMatchingChild( Node $node, callable $filter ): ?Node {
		$c = $node->firstChild;
		while ( $c && !$filter( $c ) ) {
			$c = $c->nextSibling;
		}
		return $c;
	}

	/**
	 * Test if the node has a 'nowrap' CSS rule
	 *
	 * In the general case, this CSS can come from a class,
	 * or from a <style> tag or a stylesheet or even from JS code.
	 * But, for now, we are restricting this inspection to inline CSS
	 * since the intent is to aid editors in fixing patterns that
	 * can be automatically detected.
	 *
	 * Special case for enwiki that has Template:nowrap which
	 * assigns class='nowrap' with CSS white-space:nowrap in
	 * MediaWiki:Common.css
	 */
	private function hasNoWrapCSS( Node $node ): bool {
		return $node instanceof Element && (
			str_contains( DOMCompat::getAttribute( $node, 'style' ) ?? '', 'nowrap' ) ||
			DOMUtils::hasClass( $node, 'nowrap' )
		);
	}

	/**
	 * Lint bad P wrapping.
	 *
	 * Lint category: `pwrap-bug-workaround`
	 */
	private function lintPWrapBugWorkaround(
		Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		if (
			!DOMUtils::isWikitextBlockNode( $node ) &&
			DOMUtils::isWikitextBlockNode( $node->parentNode ) &&
			$this->hasNoWrapCSS( $node )
		) {
			$p = $this->findMatchingChild( $node, static function ( $e ) {
				return DOMCompat::nodeName( $e ) === 'p';
			} );
			if ( $p ) {
				$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
				$lintObj = [
					'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
					'templateInfo' => $tplLintInfo,
					'params' => [
						'root' => DOMCompat::nodeName( $node->parentNode ),
						'child' => DOMCompat::nodeName( $node ),
					]
				];
				$env->recordLint( 'pwrap-bug-workaround', $lintObj );
			}
		}
	}

	/**
	 * Lint Tidy div span flip.
	 *
	 * Lint category: `misc-tidy-replacement-issues`
	 */
	private function lintMiscTidyReplacementIssues(
		Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		if ( DOMCompat::nodeName( $node ) !== 'span' ) {
			return;
		}

		$fc = DiffDOMUtils::firstNonSepChild( $node );
		if ( !$fc instanceof Element || DOMCompat::nodeName( $fc ) !== 'div' ) {
			return;
		}

		// No style/class attributes -- so, this won't affect rendering
		if ( !$node->hasAttribute( 'class' ) && !$node->hasAttribute( 'style' ) &&
			!$fc->hasAttribute( 'class' ) && !$fc->hasAttribute( 'style' )
		) {
			return;
		}

		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$lintObj = [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'subtype' => 'div-span-flip' ]
		];
		$env->recordLint( 'misc-tidy-replacement-issues', $lintObj );
	}

	/**
	 * Lint tidy whitespace bug.
	 *
	 * Lint category: `tidy-whitespace-bug`
	 */
	private function lintTidyWhitespaceBug(
		Env $env, Node $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		// We handle a run of nodes in one shot.
		// No need to reprocess repeatedly.
		if ( $dp->getTempFlag( TempData::PROCESSED_TIDY_WS_BUG ) ) {
			return;
		}

		// Find the longest run of nodes that are affected by white-space:nowrap CSS
		// in a way that leads to unsightly rendering in HTML5 compliant browsers.
		//
		// Check if Tidy does buggy whitespace hoisting there to provide the browser
		// opportunities to split the content in short segments.
		//
		// If so, editors would need to edit this run of nodes to introduce
		// whitespace breaks as necessary so that HTML5 browsers get that
		// same opportunity when Tidy is removed.
		$s = null;
		$nowrapNodes = [];
		'@phan-var array<array{node:Node,tidybug:bool,hasLeadingWS:bool}> $nowrapNodes';
		$startNode = $node;
		$haveTidyBug = false;
		$runLength = 0;

		// <br>, <wbr>, <hr> break a line
		while ( $node && !DOMUtils::isRemexBlockNode( $node ) &&
			!in_array( DOMCompat::nodeName( $node ), [ 'hr', 'br', 'wbr' ], true )
		) {
			if ( $node instanceof Text || !$this->hasNoWrapCSS( $node ) ) {
				// No CSS property that affects whitespace.
				$s = $node->textContent;
				if ( preg_match( '/^(\S*)\s/', $s, $m ) ) { // PORT-FIXME: non-ASCII whitespace?
					$runLength += strlen( $m[1] );
					$nowrapNodes[] = [
						'node' => $node,
						'tidybug' => false,
						'hasLeadingWS' => ( preg_match( '/^\s/', $s ) === 1 ), // PORT-FIXME: non-ASCII whitespace?
					];
					break;
				} else {
					$nowrapNodes[] = [ 'node' => $node, 'tidybug' => false ];
					$runLength += strlen( $s );
				}
			} else {
				// Find last non-comment child of node
				$last = $node->lastChild;
				while ( $last instanceof Comment ) {
					$last = $last->previousSibling;
				}

				$bug = false;
				if ( $last instanceof Text &&
					preg_match( '/\s$/D', $last->nodeValue ) // PORT-FIXME: non-ASCII whitespace?
				) {
					// In this scenario, when Tidy hoists the whitespace to
					// after the node, that whitespace is not subject to the
					// nowrap CSS => browsers can break content there.
					//
					// But, non-Tidy libraries won't hoist the whitespace.
					// So, browsers don't have a place to break content.
					$bug = true;
					$haveTidyBug = true;
				}

				$nowrapNodes[] = [ 'node' => $node, 'tidybug' => $bug ];
				$runLength += strlen( $node->textContent );
			}

			// Don't cross template boundaries at the top-level
			if ( $tplInfo && $tplInfo->last === $node ) {
				// Exiting a top-level template
				break;
			} elseif ( !$tplInfo && WTUtils::findFirstEncapsulationWrapperNode( $node ) ) {
				// Entering a top-level template
				break;
			}

			// Move to the next non-comment sibling
			$node = $node->nextSibling;
			while ( $node instanceof Comment ) {
				$node = $node->nextSibling;
			}
		}

		$markProcessedNodes = static function () use ( &$nowrapNodes ) { // Helper
			foreach ( $nowrapNodes as $o ) {
				// Phan fails at applying the instanceof type restriction to the array member when analyzing the
				// following call, but is fine when it's copied to a local variable.
				$node = $o['node'];
				if ( $node instanceof Element ) {
					DOMDataUtils::getDataParsoid( $node )->setTempFlag( TempData::PROCESSED_TIDY_WS_BUG );
				}
			}
		};

		if ( !$haveTidyBug ) {
			// Mark processed nodes and bail
			$markProcessedNodes();
			return;
		}

		// Find run before startNode that doesn't have a whitespace break
		$prev = $startNode->previousSibling;
		while ( $prev && !DOMUtils::isRemexBlockNode( $prev ) ) {
			if ( !( $prev instanceof Comment ) ) {
				$s = $prev->textContent;
				// Find the last \s in the string
				if ( preg_match( '/\s(\S*)$/D', $s, $m ) ) { // PORT-FIXME: non-ASCII whitespace here?
					$runLength += strlen( $m[1] );
					break;
				} else {
					$runLength += strlen( $s );
				}
			}
			$prev = $prev->previousSibling;
		}

		$lintConfig = $env->getLinterConfig();
		$tidyWhitespaceBugMaxLength = $lintConfig['tidyWhitespaceBugMaxLength'] ?? 100;

		if ( $runLength < $tidyWhitespaceBugMaxLength ) {
			// Mark processed nodes and bail
			$markProcessedNodes();
			return;
		}

		// For every node where Tidy hoists whitespace,
		// emit an event to flag a whitespace fixup opportunity.
		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$n = count( $nowrapNodes ) - 1;
		foreach ( $nowrapNodes as $i => $o ) {
			if ( $o['tidybug'] && $i < $n && empty( $nowrapNodes[$i + 1]['hasLeadingWS'] ) ) {
				$nowrapNode = $o['node']; // (see above)
				$lintObj = [
					'dsr' => self::findLintDSR(
						$tplLintInfo,
						$tplInfo,
						$nowrapNode instanceof Element
							? DOMDataUtils::getDataParsoid( $nowrapNode )->dsr ?? null
							: null
					),
					'templateInfo' => $tplLintInfo,
					'params' => [
						'node' => DOMCompat::nodeName( $o['node'] ),
						'sibling' => DOMCompat::nodeName( $o['node']->nextSibling )
					]
				];

				$env->recordLint( 'tidy-whitespace-bug', $lintObj );
			}
		}

		$markProcessedNodes();
	}

	/**
	 * Detect multiple-unclosed-formatting-tags errors.
	 *
	 * Since unclosed <small> and <big> tags accumulate their effects
	 * in HTML5 parsers (unlike in Tidy where it seems to suppress
	 * multiple unclosed elements of the same name), such pages will
	 * break pretty spectacularly with Remex.
	 *
	 * Ex: https://it.wikipedia.org/wiki/Hubert_H._Humphrey_Metrodome?oldid=93017491#Note
	 *
	 * Lint category: `multiple-unclosed-formatting-tags`
	 */
	private function lintMultipleUnclosedFormattingTags( array $lints, Env $env ): void {
		$firstUnclosedTag = [
			'small' => null,
			'big' => null
		];
		$multiUnclosedTagName = null;
		foreach ( $lints as $item ) {
			// Unclosed tags in tables don't leak out of the table
			if ( $item['type'] === 'missing-end-tag' && !$item['params']['inTable'] ) {
				if ( $item['params']['name'] === 'small' || $item['params']['name'] === 'big' ) {
					$tagName = $item['params']['name'];
					// @phan-suppress-next-line PhanPossiblyUndeclaredVariable
					if ( !$firstUnclosedTag[$tagName] ) {
						$firstUnclosedTag[$tagName] = $item;
					} else {
						$multiUnclosedTagName = $tagName;
						break;
					}
				}
			}
		}

		if ( $multiUnclosedTagName ) {
			$item = $firstUnclosedTag[$multiUnclosedTagName];
			if ( isset( $item['dsr'] ) ) {
				$item['dsr'] = DomSourceRange::newFromJsonArray( $item['dsr'] );
			}
			// 'dsr' is explicitly allowed to be absent just above, so read it
			// (and 'templateInfo') defensively instead of raising an
			// undefined-array-key warning.
			$env->recordLint( 'multiple-unclosed-formatting-tags', [
				'params' => $item['params'],
				'dsr' => $item['dsr'] ?? null,
				'templateInfo' => $item['templateInfo'] ?? null,
			] );
		}
	}

	/**
	 * Post-process an array of lints
	 */
	private function postProcessLints( array $lints, Env $env ): void {
		$this->lintMultipleUnclosedFormattingTags( $lints, $env );
	}

	/**
	 * Get wikitext list item ancestor
	 */
	private function getWikitextListItemAncestor( ?Node $node ): ?Node {
		while ( $node && !DOMUtils::isListItem( $node ) ) {
			$node = $node->parentNode;
		}

		if ( $node && !WTUtils::isLiteralHTMLNode( $node ) &&
			!WTUtils::fromExtensionContent( $node, 'references' )
		) {
			return $node;
		}

		return null;
	}

	/**
	 * Lint a PHP parser bug.
	 *
	 * When an HTML table is nested inside a list, if any part of the table
	 * is on a new line, the PHP parser misnests the list and the table.
	 * Tidy fixes the misnesting one way (puts table inside/outside the list)
	 * HTML5 parser fixes it another way (list expands to rest of the page!)
	 *
	 * Lint category: `multiline-html-table-in-list`
	 */
	private function lintMultilineHtmlTableInList(
		Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		$li = null;
		if ( !WTUtils::isLiteralHTMLNode( $node ) ||
			DOMCompat::nodeName( $node ) !== 'table' ||
			// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.Found
			!( $li = $this->getWikitextListItemAncestor( $node ) ) ||
			!str_contains( DOMCompat::getOuterHTML( $node ), "\n" )
		) {
			return;
		}

		// We have an HTML table nested inside a list
		// that has a newline break in its outer HTML
		// => we are in trouble with the PHP Parser + Remex combo
		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$lintObj = [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [
				'name' => 'table',
				'ancestorName' => DOMCompat::nodeName( $li ),
			],
		];
		$env->recordLint( 'multiline-html-table-in-list', $lintObj );
	}

	/**
	 * Log wikilinks or media in external links.
*
	 * HTML tags can be nested but this is not the case for <a> tags
	 * which when nested outputs the <a> tags adjacent to each other
	 * In the example below, [[Google]] is a wikilink that is nested
	 * in the outer external link
	 * [http://google.com This is [[Google]]'s search page]
	 *
	 * Linter category: `wikilink-in-extlink`
	 */
	private function lintWikilinksInExtlink(
		Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		if (
			DOMCompat::nodeName( $node ) === 'a' &&
			DOMUtils::hasRel( $node, "mw:ExtLink" ) &&
			// Images in extlinks will end up with broken up extlinks inside the
			// <figure> DOM. Those have 'misnested' flag set on them. Ignore those.
			empty( $dp->misnested )
		) {
			$next = $node->nextSibling;
			$lintError = $next instanceof Element &&
				!empty( DOMDataUtils::getDataParsoid( $next )->misnested ) &&
				// This check may not be necessary but ensures that we are
				// really in a link-in-link misnested scenario.
				DOMUtils::treeHasElement( $next, 'a', true );

			// Media as opposed to most instances of img (barring the link= trick), don't result
			// in misnesting according the html5 spec since we're actively suppressing links in
			// their structure. However, since timed media is inherently clickable, being nested
			// in an extlink could surprise a user clicking on it by navigating away from the page.
			if ( !$lintError ) {
				DOMUtils::visitDOM( $node, static function ( $element ) use ( &$lintError ) {
					if ( $element instanceof Element &&
						( DOMCompat::nodeName( $element ) === 'audio' ||
							DOMCompat::nodeName( $element ) === 'video' )
					) {
						$lintError = true;
					}
				} );
			}
			if ( $lintError ) {
				$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
				$lintObj = [
					'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
					'templateInfo' => $tplLintInfo,
				];
				$env->recordLint( 'wikilink-in-extlink', $lintObj );
			}
		}
	}

	/**
	 * Record a `large-tables` lint for the given table.
	 *
	 * @param Env $env
	 * @param ?stdClass $tplInfo
	 * @param Element $node The table being reported
	 * @param int $numColumns Number of header or data cells counted in the offending row
	 * @param int $columnsMax Configured column threshold that was exceeded
	 */
	private function recordLargeTablesLint(
		Env $env, ?stdClass $tplInfo, Element $node, int $numColumns, int $columnsMax
	): void {
		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$lintObj = [
			'dsr' => self::findLintDSR(
				$tplLintInfo, $tplInfo, DOMDataUtils::getDataParsoid( $node )->dsr ?? null
			),
			'templateInfo' => $tplLintInfo,
			'params' => [
				'name' => 'table',
				'columns' => $numColumns,
				'columnsMax' => $columnsMax,
			],
		];
		$env->recordLint( 'large-tables', $lintObj );
	}

	/**
	 * TODO: In the future, this may merit being moved to DOMUtils
	 * along with its "previous" variant.
	 */
	private function skipNonElementNodes( ?Node $n ): ?Element {
		while ( $n && !( $n instanceof Element ) ) {
			$n = $n->nextSibling;
		}
		return $n;
	}

	/**
	 * Lint large tables.
	 *
	 * Identify articles having overly-large tables
	 * to help editors optimize their articles.
	 *
	 * Linter category: `large-tables`
	 */
	private function lintLargeTables(
		Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		if ( DOMCompat::nodeName( $node ) !== 'table' ) {
			return;
		}

		// Skip tables that have nested tables in them as they are likely
		// to be used for layout and not for data representation.
		// We may check nested tables in the next iteration of this lint.
		$nestedTables = $node->getElementsByTagName( 'table' );
		if ( $nestedTables->length > 0 ) {
			return;
		}

		$lintConfig = $env->getLinterConfig();
		$maxColumns = $lintConfig['maxTableColumnHeuristic'] ?? 5;
		$maxRowsToCheck = $lintConfig['maxTableRowsToCheck'] ?? 10;

		$trCount = 0;
		$tbody = DOMCompat::querySelector( $node, 'tbody' );
		// empty table
		if ( !$tbody ) {
			return;
		}
		// skipNonElementNodes() is an instance method; call it on $this
		// rather than through self:: like a static.
		$tr = $this->skipNonElementNodes( $tbody->firstChild );
		while ( $tr && $trCount < $maxRowsToCheck ) {
			$numTh = $tr->getElementsByTagName( 'th' )->length;
			if ( $numTh > $maxColumns ) {
				$this->recordLargeTablesLint( $env, $tplInfo, $node, $numTh, $maxColumns );
				return;
			}

			$numTd = $tr->getElementsByTagName( 'td' )->length;
			if ( $numTd > $maxColumns ) {
				$this->recordLargeTablesLint( $env, $tplInfo, $node, $numTd, $maxColumns );
				return;
			}

			$tr = $this->skipNonElementNodes( $tr->nextSibling );
			$trCount++;
		}
	}

	/**
	 * Log inline background color style rules without a color style rule.
	 *
	 * This function identifies elements with inline style attributes
	 * that have background color set but don't have a color style rule.
	 * It records linter events for such elements to help editors make
	 * their articles comply with WCAG color contrast rules.
	 *
	 * Linter category: `night-mode-unaware-background-color`
	 */
	private function lintNightModeUnawareBackgroundColor(
		Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		// Get inline style attribute value
		$styleAttrValue = DOMCompat::getAttribute( $node, 'style' );

		// Check if background color is set but font color is not
		if (
			( $styleAttrValue !== null ) &&
			preg_match( '/(^|;)\s*background(-color)?\s*:/i', $styleAttrValue ) &&
			!preg_match( '/(^|;)\s*color\s*:/i', $styleAttrValue )
		) {
			$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
			$lintObj = [
				'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
				'templateInfo' => $tplLintInfo,
			];
			$env->recordLint( 'night-mode-unaware-background-color', $lintObj );
		}
	}

	/**
	 * Lint for missing image alt text
	 *
	 * Linter category: `missing-image-alt-text`
	 */
	private function lintMissingAltText(
		Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		if ( !WTUtils::isGeneratedFigure( $c ) ) {
			return;
		}

		// Extract the media element in its standard place
		$media = $c->firstChild->firstChild ?? null;
		if ( !( $media instanceof Element ) || DOMCompat::nodeName( $media ) !== 'img' ) {
			// Videos and such are handled differently; check only
			// simple image output for alt text.
			return;
		}

		if ( $media->hasAttribute( 'alt' ) ) {
			// Present and accounted for, either via explicit markup
			// or filling in from an inline caption or other future
			// source.
			//
			// Note that an explicit empty alt text will be counted
			// as present, as this may be done deliberately for
			// spacer images or similar.
			return;
		}

		// Follow the parent tree looking for aria-hidden=true or equivalent roles
		for ( $node = $media; $node->parentNode; $node = $node->parentNode ) {
			$hidden = strtolower( DOMCompat::getAttribute( $node, 'aria-hidden' ) ?? '' );
			$role = strtolower( DOMCompat::getAttribute( $node, 'role' ) ?? '' );
			if ( $hidden === 'true' || $role === 'presentation' || $role === 'none' ) {
				// This entire subtree is excluded from the accessibility tree.
				return;
			}
		}

		$resource = DOMCompat::getAttribute( $media, 'resource' ) ?? '';
		$file = basename( urldecode( $resource ) );

		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$lintObj = [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [
				'file' => $file,
			]
		];
		$env->recordLint( 'missing-image-alt-text', $lintObj );
	}

	/**
	 * Lint duplicate ids in the page
	 *
	 * The first element seen with a given id is only remembered; every
	 * later element carrying the same id is reported.
	 *
	 * Linter category: `duplicate-ids`
	 */
	private function lintDuplicateIds(
		Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
	): void {
		$id = DOMCompat::getAttribute( $node, 'id' );
		if ( $id === null ) {
			return;
		}
		if ( !isset( $this->seenIds[$id] ) ) {
			$this->seenIds[$id] = 1;
			return;
		}
		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$lintObj = [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'id' => $id ],
		];
		$env->recordLint( 'duplicate-ids', $lintObj );
	}

	/**
	 * Log wikitext fixups
	 */
	private function logWikitextFixups(
		Element $node, Env $env, ?stdClass $tplInfo
	): void {
		$dp = DOMDataUtils::getDataParsoid( $node );
		$this->lintTreeBuilderFixup( $env, $node, $dp, $tplInfo );
		$this->lintDeletableTableTag( $env, $node, $dp, $tplInfo ); // For T161341
		$this->lintPWrapBugWorkaround( $env, $node, $dp, $tplInfo ); // For T161306
		$this->lintObsoleteTag( $env, $node, $dp, $tplInfo );
		$this->lintBogusImageOptions( $env, $node, $dp, $tplInfo );
		$this->lintTidyWhitespaceBug( $env, $node, $dp, $tplInfo );
		$this->lintMiscTidyReplacementIssues( $env, $node, $dp, $tplInfo );
		$this->lintMultilineHtmlTableInList( $env, $node, $dp, $tplInfo );
		$this->lintWikilinksInExtlink( $env, $node, $dp, $tplInfo );
		$this->lintLargeTables( $env, $node, $dp, $tplInfo );
		$this->lintNightModeUnawareBackgroundColor( $env, $node, $dp, $tplInfo );
		$this->lintFostered( $env, $node, $dp, $tplInfo );
		$this->lintMissingAltText( $env, $node, $dp, $tplInfo );
		$this->lintDuplicateIds( $env, $node, $dp, $tplInfo );
	}

	/**
	 * Walk the DOM and compute lints for the entire tree.
* - When we enter encapsulated content (templates or extensions), * compute "tplInfo" (misnamed given that it can be an extension) * so that lints from the templates' content can be mapped back * to the transclusion that generated them. * - When we process extensions, if we have a lint handler for the * extension, let the extension's lint handler compute lints. */ private function findLints( Node $root, Env $env, ?stdClass $tplInfo = null ): void { $node = $root->firstChild; while ( $node !== null ) { if ( !$node instanceof Element ) { $node = $node->nextSibling; continue; } // !tplInfo check is to protect against templated content in // extensions which might in turn be nested in templated content. if ( !$tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) { $aboutSibs = WTUtils::getAboutSiblings( $node, DOMCompat::getAttribute( $node, 'about' ) ); $tplInfo = (object)[ 'first' => $node, 'last' => end( $aboutSibs ), 'dsr' => DOMDataUtils::getDataParsoid( $node )->dsr ?? null, // FIXME: This is not being used. Instead the code is recomputing // this info in findEnclosingTemplateName. 'isTemplated' => DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ), ]; } $handled = false; // Let native extensions lint their content $nativeExt = WTUtils::getNativeExt( $env, $node ); if ( $nativeExt ) { if ( !$this->extApi ) { $this->extApi = new ParsoidExtensionAPI( $env ); } $handled = $nativeExt->lintHandler( $this->extApi, $node, function ( $extRootNode ) use ( $env, $tplInfo ) { $this->findLints( $extRootNode, $env, empty( $tplInfo->isTemplated ) ? null : $tplInfo ); } ); // NOTE: See the note in WrapSectionsState::shouldOmitFromTOC() // but we've assumed extension content is contained in a single // wrapper node and it's safe to move to $node->nextSibling. 
} // Default node handler if ( $handled === false ) { // Lint this node $this->logWikitextFixups( $node, $env, $tplInfo ); // Lint subtree $this->findLints( $node, $env, $tplInfo ); } if ( $tplInfo && $tplInfo->last === $node ) { $tplInfo = null; } $node = $node->nextSibling; } } /** * This is only invoked on the top-level document * @inheritDoc */ public function run( Env $env, Node $root, array $options = [], bool $atTopLevel = false ): void { if ( !$env->linting() ) { return; } // Track time spent linting so we can evaluate benefits // of migrating this code off the critical path to its own // post processor. $metrics = $env->getSiteConfig()->metrics(); $timer = null; if ( $metrics ) { $timer = Timing::start( $metrics ); } $this->findLints( $root, $env ); $this->postProcessLints( $env->getLints(), $env ); if ( $metrics ) { $timer->end( "linting" ); } } } PK ! �Çcn$ n$ AddMetaData.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Closure; use DateTime; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\Parsoid; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Utils\PHPUtils; use Wikimedia\Parsoid\Utils\Utils; use Wikimedia\Parsoid\Wt2Html\DOMPostProcessor; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; class AddMetaData implements Wt2HtmlDOMProcessor { private array $metadataMap; private ?DOMPostProcessor $parentPipeline; public function __construct( ?DOMPostProcessor $domPP ) { $this->parentPipeline = $domPP; // map from mediawiki metadata names to RDFa property names $this->metadataMap = [ 'ns' => [ 'property' => 'mw:pageNamespace', 'content' => '%d', ], 'id' => [ 'property' => 'mw:pageId', 'content' => '%d', ], // DO NOT ADD rev_user, rev_userid, and rev_comment (See T125266) // 'rev_revid' is used to set the overall subject of the document, 
we don't // need to add a specific <meta> or <link> element for it. 'rev_parentid' => [ 'rel' => 'dc:replaces', 'resource' => 'mwr:revision/%d', ], 'rev_timestamp' => [ 'property' => 'dc:modified', 'content' => static function ( $m ) { # Convert from TS_MW ("mediawiki timestamp") format $dt = DateTime::createFromFormat( 'YmdHis', $m['rev_timestamp'] ); # Note that DateTime::ISO8601 is not actually ISO8601, alas. return $dt->format( 'Y-m-d\TH:i:s.000\Z' ); }, ], 'rev_sha1' => [ 'property' => 'mw:revisionSHA1', 'content' => '%s', ] ]; } private function updateBodyClasslist( Element $body, Env $env ): void { $dir = $env->getPageConfig()->getPageLanguageDir(); $bodyCL = DOMCompat::getClassList( $body ); $bodyCL->add( 'mw-content-' . $dir ); $bodyCL->add( 'sitedir-' . $dir ); $bodyCL->add( $dir ); $body->setAttribute( 'dir', $dir ); // Set 'mw-body-content' directly on the body. // This is the designated successor for #bodyContent in core skins. $bodyCL->add( 'mw-body-content' ); // Set 'parsoid-body' to add the desired layout styling from Vector. $bodyCL->add( 'parsoid-body' ); // Also, add the 'mediawiki' class. // Some MediaWiki:Common.css seem to target this selector. $bodyCL->add( 'mediawiki' ); // Set 'mw-parser-output' directly on the body. // Templates target this class as part of the TemplateStyles RFC // FIXME: This isn't expected to be found on the same element as the // body class above, since some css targets it as a descendant. // In visual diff'ing, we migrate the body contents to a wrapper div // with this class to reduce visual differences. Consider getting // rid of it. $bodyCL->add( 'mw-parser-output' ); // Set the parsoid version on the body, for consistency with // the wrapper div. 
$body->setAttribute( 'data-mw-parsoid-version', Parsoid::version() ); $body->setAttribute( 'data-mw-html-version', Parsoid::defaultHTMLVersion() ); } /** * @inheritDoc */ public function run( Env $env, Node $root, array $options = [], bool $atTopLevel = false ): void { $title = $env->getContextTitle(); $document = $root->ownerDocument; // Set the charset in the <head> first. // This also adds the <head> element if it was missing. DOMUtils::appendToHead( $document, 'meta', [ 'charset' => 'utf-8' ] ); // add mw: and mwr: RDFa prefixes $prefixes = [ 'dc: http://purl.org/dc/terms/', 'mw: http://mediawiki.org/rdf/' ]; $document->documentElement->setAttribute( 'prefix', implode( ' ', $prefixes ) ); // (From wfParseUrl in core:) // Protocol-relative URLs are handled really badly by parse_url(). // It's so bad that the easiest way to handle them is to just prepend // 'https:' and strip the protocol out later. $baseURI = $env->getSiteConfig()->baseURI(); $wasRelative = substr( $baseURI, 0, 2 ) == '//'; if ( $wasRelative ) { $baseURI = "https:$baseURI"; } // add 'https://' to baseURI if it was missing $pu = parse_url( $baseURI ); $mwrPrefix = ( !empty( $pu['scheme'] ) ? '' : 'https://' ) . $baseURI . 'Special:Redirect/'; ( DOMCompat::getHead( $document ) )->setAttribute( 'prefix', 'mwr: ' . $mwrPrefix ); // add <head> content based on page meta data: // Add page / revision metadata to the <head> // PORT-FIXME: We will need to do some refactoring to eliminate // this hardcoding. 
Probably even merge this into metadataMap $pageConfig = $env->getPageConfig(); $revProps = [ 'id' => $pageConfig->getPageId(), 'ns' => $title->getNamespace(), 'rev_parentid' => $pageConfig->getParentRevisionId(), 'rev_revid' => $pageConfig->getRevisionId(), 'rev_sha1' => $pageConfig->getRevisionSha1(), 'rev_timestamp' => $pageConfig->getRevisionTimestamp() ]; foreach ( $revProps as $key => $value ) { // generate proper attributes for the <meta> or <link> tag if ( $value === null || $value === '' || !isset( $this->metadataMap[$key] ) ) { continue; } $attrs = []; $mdm = $this->metadataMap[$key]; /** FIXME: The JS side has a bunch of other checks here */ foreach ( $mdm as $k => $v ) { // evaluate a function, or perform sprintf-style formatting, or // use string directly, depending on value in metadataMap if ( $v instanceof Closure ) { $v = $v( $revProps ); } elseif ( strpos( $v, '%' ) !== false ) { // @phan-suppress-next-line PhanPluginPrintfVariableFormatString $v = sprintf( $v, $value ); } $attrs[$k] = $v; } // <link> is used if there's a resource or href attribute. DOMUtils::appendToHead( $document, isset( $attrs['resource'] ) || isset( $attrs['href'] ) ? 'link' : 'meta', $attrs ); } if ( $revProps['rev_revid'] ) { $document->documentElement->setAttribute( 'about', $mwrPrefix . 'revision/' . $revProps['rev_revid'] ); } // Normalize before comparison if ( $title->isSameLinkAs( $env->getSiteConfig()->mainPageLinkTarget() ) ) { DOMUtils::appendToHead( $document, 'meta', [ 'property' => 'isMainPage', 'content' => 'true' /* HTML attribute values should be strings */ ] ); } // Set the parsoid content-type strings // FIXME: Should we be using http-equiv for this? DOMUtils::appendToHead( $document, 'meta', [ 'property' => 'mw:htmlVersion', 'content' => $env->getOutputContentVersion() ] ); // Temporary backward compatibility for clients // This could be skipped if we support a version downgrade path // with a major version bump. 
DOMUtils::appendToHead( $document, 'meta', [ 'property' => 'mw:html:version', 'content' => $env->getOutputContentVersion() ] ); $expTitle = explode( '/', $title->getPrefixedDBKey() ); $expTitle = array_map( static function ( $comp ) { return PHPUtils::encodeURIComponent( $comp ); }, $expTitle ); DOMUtils::appendToHead( $document, 'link', [ 'rel' => 'dc:isVersionOf', 'href' => $env->getSiteConfig()->baseURI() . implode( '/', $expTitle ) ] ); // Add base href pointing to the wiki root DOMUtils::appendToHead( $document, 'base', [ 'href' => $env->getSiteConfig()->baseURI() ] ); // Stick data attributes in the head if ( $env->pageBundle ) { DOMDataUtils::injectPageBundle( $document, DOMDataUtils::getPageBundle( $document ) ); } // PageConfig guarantees language will always be non-null. $lang = $env->getPageConfig()->getPageLanguageBcp47(); $body = DOMCompat::getBody( $document ); $body->setAttribute( 'lang', $lang->toBcp47Code() ); $this->updateBodyClasslist( $body, $env ); // T324431: Note that this is *not* the displaytitle, and that // the title element contents are plaintext *not* HTML DOMCompat::setTitle( $document, $title->getPrefixedText() ); $env->getSiteConfig()->exportMetadataToHeadBcp47( $document, $env->getMetadata(), $title->getPrefixedText(), $lang ); // Indicate whether LanguageConverter is enabled, so that downstream // caches can split on variant (if necessary) DOMUtils::appendToHead( $document, 'meta', [ 'http-equiv' => 'content-language', // Note that this is "wrong": we should be returning // $env->htmlContentLanguageBcp47()->toBcp47Code() directly // but for back-compat we'll return the "old" mediawiki-internal // code for now 'content' => Utils::bcp47ToMwCode( # T323052: remove this call $env->htmlContentLanguageBcp47()->toBcp47Code() ), ] ); DOMUtils::appendToHead( $document, 'meta', [ 'http-equiv' => 'vary', 'content' => $env->htmlVary() ] ); if ( $env->profiling() && $this->parentPipeline ) { $body = DOMCompat::getBody( $document ); 
$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) ); $body->appendChild( $body->ownerDocument->createComment( $this->parentPipeline->getTimeProfile() ) ); $body->appendChild( $body->ownerDocument->createTextNode( "\n" ) ); } if ( $env->hasDumpFlag( 'wt2html:limits' ) ) { /* * PORT-FIXME: Not yet implemented $env->printWt2HtmlResourceUsage( [ 'HTML Size' => strlen( DOMCompat::getOuterHTML( $document->documentElement ) ) ] ); */ } } } PK ! �F=� � WrapAnnotations.phpnu �Iw�� <?php declare( strict_types=1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; class WrapAnnotations implements Wt2HtmlDOMProcessor { /** * Encapsulate template-affected DOM structures by wrapping text nodes into * spans and adding RDFa attributes to all subtree roots according to * http://www.mediawiki.org/wiki/Parsoid/RDFa_vocabulary#Template_content * @inheritDoc */ public function run( Env $env, Node $root, array $options = [], bool $atTopLevel = false ): void { if ( $env->hasAnnotations ) { $op = new AnnotationDOMRangeBuilder( $root->ownerDocument, $options['frame'] ); $op->execute( $root ); } } } PK ! 
W�%k k MigrateTemplateMarkerMetas.phpnu �Iw�� <?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors;

use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\DOM\DocumentFragment;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Utils\DiffDOMUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\WTUtils;
use Wikimedia\Parsoid\Wikitext\Consts;
use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor;

/**
 * DOM pass that moves template start/end marker metas up across the
 * start/end tag of their parent node when the heuristics documented on
 * doMigrate() allow it.
 */
class MigrateTemplateMarkerMetas implements Wt2HtmlDOMProcessor {
	/**
	 * Decide whether the first (non-separator) child of a node is a marker
	 * meta that should be moved in front of that node.
	 *
	 * An end marker is always a candidate. A start marker is a candidate only
	 * when the start depth recorded for its about id in the document bag's
	 * transclusionMetaTagDepthMap is greater than the recorded end depth.
	 *
	 * @param Node $firstChild
	 * @return bool
	 */
	private function migrateFirstChild( Node $firstChild ): bool {
		if ( WTUtils::isTplEndMarkerMeta( $firstChild ) ) {
			return true;
		}

		if ( WTUtils::isTplStartMarkerMeta( $firstChild ) ) {
			'@phan-var Element $firstChild'; // @var Element $firstChild

			$docDataBag = DOMDataUtils::getBag( $firstChild->ownerDocument );
			$about = DOMCompat::getAttribute( $firstChild, 'about' );
			$startDepth = $docDataBag->transclusionMetaTagDepthMap[$about]['start'];
			$endDepth = $docDataBag->transclusionMetaTagDepthMap[$about]['end'];
			return $startDepth > $endDepth;
		}

		return false;
	}

	/**
	 * Decide whether the last (non-separator) child of a node is a marker
	 * meta that should be moved behind that node.
	 *
	 * A start marker is always a candidate. An end marker is a candidate only
	 * when the start depth recorded for its about id in the document bag's
	 * transclusionMetaTagDepthMap is smaller than the recorded end depth.
	 *
	 * @param Node $lastChild
	 * @return bool
	 */
	private function migrateLastChild( Node $lastChild ): bool {
		if ( WTUtils::isTplStartMarkerMeta( $lastChild ) ) {
			return true;
		}

		if ( WTUtils::isTplEndMarkerMeta( $lastChild ) ) {
			'@phan-var Element $lastChild'; // @var Element $lastChild

			$docDataBag = DOMDataUtils::getBag( $lastChild->ownerDocument );
			$about = DOMCompat::getAttribute( $lastChild, 'about' );
			$startDepth = $docDataBag->transclusionMetaTagDepthMap[$about]['start'];
			$endDepth = $docDataBag->transclusionMetaTagDepthMap[$about]['end'];
			return $startDepth < $endDepth;
		}

		return false;
	}

	/**
	 * Decrement the depth recorded for a marker meta that doMigrate() has
	 * just moved out of its parent: the 'end' entry for an end marker,
	 * the 'start' entry otherwise.
	 *
	 * @param Element $elt The migrated marker meta
	 */
	private function updateDepths( Element $elt ): void {
		// Update depths
		$docDataBag = DOMDataUtils::getBag( $elt->ownerDocument );
		$about = DOMCompat::getAttribute( $elt, 'about' );
		if ( WTUtils::isTplEndMarkerMeta( $elt ) ) {
			// end depth
			$docDataBag->transclusionMetaTagDepthMap[$about]['end']--;
		} else {
			// start depth
			$docDataBag->transclusionMetaTagDepthMap[$about]['start']--;
		}
	}

	/**
	 * The goal of this pass is to assist the WrapTemplates pass
	 * by using some simple heuristics to bring the DOM into a more
	 * canonical form. There is no correctness issue with WrapTemplates
	 * wrapping a wider range of content than what a template generated.
	 * These heuristics can be evolved as needed.
	 *
	 * Given the above considerations, we are going to consider migration
	 * possibilities only where the migration won't lead to additional
	 * untemplated content getting pulled into the template wrapper.
	 *
	 * The simplest heuristics that satisfy this constraint are:
	 * - Only examine first/last child of a node.
	 *   - We relax the first/last child constraint by ignoring
	 *     separator nodes (comments, whitespace) but this is
	 *     something worth revisiting in the future.
	 * - Only migrate upwards if the node's start/end tag (barrier)
	 *   comes from zero-width-wikitext.
	 * - If the start meta is the last child OR if the end meta is
	 *   the first child, migrate up.
	 * - If the start meta is the first child OR if the end meta is
	 *   the last child, there is no benefit to migrating the meta tags
	 *   up if both the start and end metas are at the same tree depth.
	 * - In some special cases, it might be possible to migrate
	 *   metas downward rather than upward. Migrating downwards has
	 *   wt2wt corruption implications if done incorrectly. So, we
	 *   aren't considering this possibility right now.
	 *
	 * @param Element|DocumentFragment $node
	 * @param Env $env
	 */
	private function doMigrate( Node $node, Env $env ): void {
		// Process the subtree first (post-order). The next sibling is
		// captured up front because migration out of $c inserts nodes
		// around $c in this child list.
		$c = $node->firstChild;
		while ( $c ) {
			$sibling = $c->nextSibling;
			if ( $c->hasChildNodes() ) {
				'@phan-var Element $c'; // @var Element $c
				$this->doMigrate( $c, $env );
			}
			$c = $sibling;
		}

		// No migration out of fragment
		if ( DOMUtils::atTheTop( $node ) ) {
			return;
		}

		// Check if $node is a fostered node
		$fostered = !empty( DOMDataUtils::getDataParsoid( $node )->fostered );

		$firstChild = DiffDOMUtils::firstNonSepChild( $node );
		if ( $firstChild && $this->migrateFirstChild( $firstChild ) ) {
			// We can migrate the meta-tag across this node's start-tag barrier only
			// if that start-tag is zero-width, or auto-inserted.
			$tagWidth = Consts::$WtTagWidths[DOMCompat::nodeName( $node )] ?? null;
			DOMUtils::assertElt( $node );
			if ( ( $tagWidth && $tagWidth[0] === 0 && !WTUtils::isLiteralHTMLNode( $node ) ) ||
				!empty( DOMDataUtils::getDataParsoid( $node )->autoInsertedStart )
			) {
				// Move every leading child (separators included) up to and
				// including the marker meta in front of $node.
				$sentinel = $firstChild;
				do {
					$firstChild = $node->firstChild;
					$node->parentNode->insertBefore( $firstChild, $node );
					if ( $fostered && $firstChild instanceof Element ) {
						// $firstChild is being migrated out of a fostered node
						// So, mark $firstChild itself fostered!
						DOMDataUtils::getDataParsoid( $firstChild )->fostered = true;
					}
				} while ( $sentinel !== $firstChild );

				$this->updateDepths( $firstChild );
			}
		}

		$lastChild = DiffDOMUtils::lastNonSepChild( $node );
		if ( $lastChild && $this->migrateLastChild( $lastChild ) ) {
			// We can migrate the meta-tag across this node's end-tag barrier only
			// if that end-tag is zero-width, or auto-inserted.
			$tagWidth = Consts::$WtTagWidths[DOMCompat::nodeName( $node )] ?? null;
			DOMUtils::assertElt( $node );
			if ( ( $tagWidth && $tagWidth[1] === 0 && !WTUtils::isLiteralHTMLNode( $node ) ) ||
				( !empty( DOMDataUtils::getDataParsoid( $node )->autoInsertedEnd ) &&
					// Except, don't migrate out of a table since the end meta
					// marker may have been fostered and this is more likely to
					// result in a flipped range that isn't enclosed.
					DOMCompat::nodeName( $node ) !== 'table' )
			) {
				// Move every trailing child (separators included) back to and
				// including the marker meta behind $node.
				$sentinel = $lastChild;
				do {
					$lastChild = $node->lastChild;
					$node->parentNode->insertBefore( $lastChild, $node->nextSibling );
					if ( $fostered && $lastChild instanceof Element ) {
						// $lastChild is being migrated out of a fostered node
						// So, mark $lastChild itself fostered!
						DOMDataUtils::getDataParsoid( $lastChild )->fostered = true;
					}
				} while ( $sentinel !== $lastChild );

				$this->updateDepths( $lastChild );
			}
		}
	}

	/**
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		// Don't run this in template content.
		// $options defaults to [], so the key may be absent; empty() reads it
		// without raising an "Undefined array key" warning.
		if ( !empty( $options['inTemplate'] ) ) {
			return;
		}

		if ( $root instanceof Element || $root instanceof DocumentFragment ) {
			$this->doMigrate( $root, $env );
		}
	}
}
PK ! 
��� � ProcessTreeBuilderFixups.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\Utils\DiffDOMUtils; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\WTUtils; use Wikimedia\Parsoid\Wt2Html\Frame; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; class ProcessTreeBuilderFixups implements Wt2HtmlDOMProcessor { private static function removeAutoInsertedEmptyTags( Frame $frame, Node $node ): void { $c = $node->firstChild; while ( $c !== null ) { // FIXME: Encapsulation only happens after this phase, so you'd think // we wouldn't encounter any, but the html pre tag inserts extension // content directly, rather than passing it through as a fragment for // later unpacking. Same as above. if ( WTUtils::isEncapsulationWrapper( $c ) ) { $c = WTUtils::skipOverEncapsulatedContent( $c ); continue; } if ( $c instanceof Element ) { self::removeAutoInsertedEmptyTags( $frame, $c ); $dp = DOMDataUtils::getDataParsoid( $c ); // We do this down here for all elements since the quote transformer // also marks up elements as auto-inserted and we don't want to be // constrained by any conditions. Further, this pass should happen // before paragraph wrapping on the dom, since we don't want this // stripping to result in empty paragraphs. 
// Delete empty auto-inserted elements if ( !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd ) && ( !$c->hasChildNodes() || ( DiffDOMUtils::hasNChildren( $c, 1 ) && !( $c->firstChild instanceof Element ) && preg_match( '/^\s*$/D', $c->textContent ) ) ) ) { $next = $c->nextSibling; if ( $c->firstChild ) { // migrate the ws out $c->parentNode->insertBefore( $c->firstChild, $c ); } $c->parentNode->removeChild( $c ); $c = $next; continue; } } $c = $c->nextSibling; } } /** * @inheritDoc */ public function run( Env $env, Node $root, array $options = [], bool $atTopLevel = false ): void { self::removeAutoInsertedEmptyTags( $options['frame'], $root ); } } PK ! >��B�f �f AddMediaInfo.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Assert\Assert; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Core\ContentMetadataCollectorStringSets as CMCSS; use Wikimedia\Parsoid\Core\Sanitizer; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\Html2Wt\WTSUtils; use Wikimedia\Parsoid\NodeData\DataMw; use Wikimedia\Parsoid\NodeData\DataMwError; use Wikimedia\Parsoid\Utils\ContentUtils; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Utils\Title; use Wikimedia\Parsoid\Utils\WTUtils; use Wikimedia\Parsoid\Wikitext\Consts; use Wikimedia\Parsoid\Wt2Html\PegTokenizer; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; class AddMediaInfo implements Wt2HtmlDOMProcessor { /** * Extract the dimensions for media. 
* * @param Env $env * @param array $attrs * @param array $info * @phan-param array{size:array{height?:int,width?:int},format:string} $attrs * @return array */ private static function handleSize( Env $env, array $attrs, array $info ): array { $height = $info['height']; $width = $info['width']; Assert::invariant( is_numeric( $height ) && $height !== NAN, 'Expected $height as a valid number' ); Assert::invariant( is_numeric( $width ) && $width !== NAN, 'Expected $width as a valid number' ); if ( !empty( $info['thumburl'] ) && !empty( $info['thumbheight'] ) ) { $height = $info['thumbheight']; } if ( !empty( $info['thumburl'] ) && !empty( $info['thumbwidth'] ) ) { $width = $info['thumbwidth']; } // Audio files don't have dimensions, so we fallback to these arbitrary // defaults, and the "mw-default-audio-height" class is added. if ( $info['mediatype'] === 'AUDIO' ) { // FIXME: TMH uses 23 but VE wants 32 $height = /* height || */32; // Arguably, audio should respect a defined height $width = max( 35, $width ?: $env->getSiteConfig()->widthOption() ); } // Handle client-side upscaling (including 'border') $mustRender = $info['mustRender'] ?? $info['mediatype'] !== 'BITMAP'; // Calculate the scaling ratio from the user-specified width and height $ratio = null; if ( !empty( $attrs['dims']['height'] ) && !empty( $info['height'] ) ) { $ratio = $attrs['dims']['height'] / $info['height']; } if ( !empty( $attrs['dims']['width'] ) && !empty( $info['width'] ) ) { $r = $attrs['dims']['width'] / $info['width']; $ratio = ( $ratio === null || $r < $ratio ) ? $r : $ratio; } // If the user requested upscaling, then this is denied in the thumbnail // and frameless format, except for files with mustRender. 
if ( $ratio !== null && $ratio > 1 && !$mustRender && ( $attrs['format'] === 'Thumb' || $attrs['format'] === 'Frameless' ) ) { // Upscaling denied $height = $info['height']; $width = $info['width']; } return [ 'height' => $height, 'width' => $width ]; } /** * This is a port of TMH's parseTimeString() * * @param string $timeString * @param int|float|null $length * @return int|float|null */ private static function parseTimeString( string $timeString, $length = null ) { $parts = explode( ':', $timeString ); $time = 0; $countParts = count( $parts ); if ( $countParts > 3 ) { return null; } for ( $i = 0; $i < $countParts; $i++ ) { if ( !is_numeric( $parts[$i] ) ) { return null; } $time += floatval( $parts[$i] ) * pow( 60, $countParts - 1 - $i ); } if ( $time < 0 ) { $time = 0; } elseif ( $length !== null ) { if ( $time > $length ) { $time = $length - 1; } } return $time; } /** * Handle media fragments * https://www.w3.org/TR/media-frags/ * * @param array $info * @param DataMw $dataMw * @return string */ private static function parseFrag( array $info, DataMw $dataMw ): string { $frag = ''; $starttime = WTSUtils::getAttrFromDataMw( $dataMw, 'starttime', true ); $endtime = WTSUtils::getAttrFromDataMw( $dataMw, 'endtime', true ); if ( $starttime || $endtime ) { $frag .= '#t='; if ( $starttime ) { $time = self::parseTimeString( $starttime->value['txt'], $info['duration'] ?? null ); if ( $time !== null ) { $frag .= $time; } } if ( $endtime ) { $time = self::parseTimeString( $endtime->value['txt'], $info['duration'] ?? null ); if ( $time !== null ) { $frag .= ',' . $time; } } } return $frag; } private static function addSources( Element $elt, array $info, DataMw $dataMw, bool $hasDimension ): void { $doc = $elt->ownerDocument; $frag = self::parseFrag( $info, $dataMw ); if ( is_array( $info['thumbdata']['derivatives'] ?? null ) ) { // BatchAPI's `getAPIData` $derivatives = $info['thumbdata']['derivatives']; } elseif ( is_array( $info['derivatives'] ?? 
null ) ) { // "videoinfo" prop $derivatives = $info['derivatives']; } else { $derivatives = [ [ 'src' => $info['url'], 'type' => $info['mime'], 'width' => (string)$info['width'], 'height' => (string)$info['height'], ], ]; } foreach ( $derivatives as $o ) { $source = $doc->createElement( 'source' ); $source->setAttribute( 'src', $o['src'] . $frag ); $source->setAttribute( 'type', $o['type'] ); // T339375 $fromFile = isset( $o['transcodekey'] ) ? '' : '-file'; if ( $hasDimension ) { $source->setAttribute( 'data' . $fromFile . '-width', (string)$o['width'] ); $source->setAttribute( 'data' . $fromFile . '-height', (string)$o['height'] ); } if ( !$fromFile ) { $source->setAttribute( 'data-transcodekey', $o['transcodekey'] ); } $elt->appendChild( $source ); } } private static function addTracks( Element $elt, array $info ): void { $doc = $elt->ownerDocument; if ( is_array( $info['thumbdata']['timedtext'] ?? null ) ) { // BatchAPI's `getAPIData` $timedtext = $info['thumbdata']['timedtext']; } elseif ( is_array( $info['timedtext'] ?? null ) ) { // "videoinfo" prop $timedtext = $info['timedtext']; } else { $timedtext = []; } foreach ( $timedtext as $o ) { $track = $doc->createElement( 'track' ); $track->setAttribute( 'kind', $o['kind'] ?? '' ); $track->setAttribute( 'type', $o['type'] ?? '' ); $track->setAttribute( 'src', $o['src'] ?? '' ); $track->setAttribute( 'srclang', $o['srclang'] ?? '' ); $track->setAttribute( 'label', $o['label'] ?? '' ); $track->setAttribute( 'data-mwtitle', $o['title'] ?? '' ); $track->setAttribute( 'data-dir', $o['dir'] ?? '' ); $elt->appendChild( $track ); } } /** * Abstract way to get the path for an image given an info object. 
* * @param array $info * @return string */ private static function getPath( array $info ) { $path = ''; if ( !empty( $info['thumburl'] ) ) { $path = $info['thumburl']; } elseif ( !empty( $info['url'] ) ) { $path = $info['url']; } return $path; } /** * @param Env $env * @param Element $span * @param array $attrs * @param array $info * @param DataMw $dataMw * @param Element $container * @param string|null $alt Unused, but matches the signature of handlers * @return Element */ private static function handleAudio( Env $env, Element $span, array $attrs, array $info, DataMw $dataMw, Element $container, ?string $alt ): Element { $doc = $span->ownerDocument; $audio = $doc->createElement( 'audio' ); $audio->setAttribute( 'controls', '' ); $audio->setAttribute( 'preload', 'none' ); $muted = WTSUtils::getAttrFromDataMw( $dataMw, 'muted', false ); if ( $muted ) { $audio->setAttribute( 'muted', '' ); } $loop = WTSUtils::getAttrFromDataMw( $dataMw, 'loop', false ); if ( $loop ) { $audio->setAttribute( 'loop', '' ); } // HACK(T295514): Until T313875 is implemented $audio->setAttribute( 'data-mw-tmh', '' ); $size = self::handleSize( $env, $attrs, $info ); DOMDataUtils::addNormalizedAttribute( $audio, 'height', (string)$size['height'], null, true ); DOMDataUtils::addNormalizedAttribute( $audio, 'width', (string)$size['width'], null, true ); $audio->setAttribute( 'style', "width: {$size['width']}px;" ); // Hardcoded until defined heights are respected. // See `AddMediaInfo::handleSize` DOMCompat::getClassList( $container )->add( 'mw-default-audio-height' ); self::copyOverAttribute( $audio, $span, 'resource' ); if ( $span->hasAttribute( 'lang' ) ) { self::copyOverAttribute( $audio, $span, 'lang' ); } if ( $info['duration'] ?? 
null ) { $audio->setAttribute( 'data-durationhint', (string)ceil( (float)$info['duration'] ) ); } self::addSources( $audio, $info, $dataMw, false ); self::addTracks( $audio, $info ); return $audio; } /** * @param Env $env * @param Element $span * @param array $attrs * @param array $info * @param DataMw $dataMw * @param Element $container * @param string|null $alt Unused, but matches the signature of handlers * @return Element */ private static function handleVideo( Env $env, Element $span, array $attrs, array $info, DataMw $dataMw, Element $container, ?string $alt ): Element { $doc = $span->ownerDocument; $video = $doc->createElement( 'video' ); if ( !empty( $info['thumburl'] ) ) { $video->setAttribute( 'poster', self::getPath( $info ) ); } $video->setAttribute( 'controls', '' ); $video->setAttribute( 'preload', 'none' ); $muted = WTSUtils::getAttrFromDataMw( $dataMw, 'muted', false ); if ( $muted ) { $video->setAttribute( 'muted', '' ); } $loop = WTSUtils::getAttrFromDataMw( $dataMw, 'loop', false ); if ( $loop ) { $video->setAttribute( 'loop', '' ); } // HACK(T295514): Until T313875 is implemented $video->setAttribute( 'data-mw-tmh', '' ); $size = self::handleSize( $env, $attrs, $info ); DOMDataUtils::addNormalizedAttribute( $video, 'height', (string)$size['height'], null, true ); DOMDataUtils::addNormalizedAttribute( $video, 'width', (string)$size['width'], null, true ); self::copyOverAttribute( $video, $span, 'resource' ); if ( $span->hasAttribute( 'lang' ) ) { self::copyOverAttribute( $video, $span, 'lang' ); } if ( $info['duration'] ?? null ) { $video->setAttribute( 'data-durationhint', (string)ceil( (float)$info['duration'] ) ); } self::addSources( $video, $info, $dataMw, true ); self::addTracks( $video, $info ); return $video; } /** * Set up the actual image structure, attributes, etc. 
* * @param Env $env * @param Element $span * @param array $attrs * @param array $info * @param DataMw $dataMw * @param Element $container * @param string|null $alt * @return Element */ private static function handleImage( Env $env, Element $span, array $attrs, array $info, DataMw $dataMw, Element $container, ?string $alt ): Element { $doc = $span->ownerDocument; $img = $doc->createElement( 'img' ); if ( $alt !== null ) { $img->setAttribute( 'alt', $alt ); } self::copyOverAttribute( $img, $span, 'resource' ); $img->setAttribute( 'src', self::getPath( $info ) ); $img->setAttribute( 'decoding', 'async' ); if ( $span->hasAttribute( 'lang' ) ) { self::copyOverAttribute( $img, $span, 'lang' ); } // Add (read-only) information about original file size (T64881) $img->setAttribute( 'data-file-width', (string)$info['width'] ); $img->setAttribute( 'data-file-height', (string)$info['height'] ); $img->setAttribute( 'data-file-type', strtolower( $info['mediatype'] ?? '' ) ); $size = self::handleSize( $env, $attrs, $info ); DOMDataUtils::addNormalizedAttribute( $img, 'height', (string)$size['height'], null, true ); DOMDataUtils::addNormalizedAttribute( $img, 'width', (string)$size['width'], null, true ); // Handle "responsive" images, i.e. srcset if ( !empty( $info['responsiveUrls'] ) ) { $candidates = []; foreach ( $info['responsiveUrls'] as $density => $url ) { $candidates[] = $url . ' ' . $density . 'x'; } if ( $candidates ) { $img->setAttribute( 'srcset', implode( ', ', $candidates ) ); } } return $img; } private static function makeErr( string $key, string $message, ?array $params = null ): DataMwError { return new DataMwError( $key, $params ?? 
[], $message ); } /** * @param Element $container * @param Element $span * @param list<DataMwError> $errs * @param DataMw $dataMw * @param ?string $alt */ private static function handleErrors( Element $container, Element $span, array $errs, DataMw $dataMw, ?string $alt ): void { if ( !DOMUtils::hasTypeOf( $container, 'mw:Error' ) ) { DOMUtils::addTypeOf( $container, 'mw:Error', true ); } if ( is_array( $dataMw->errors ?? null ) ) { $errs = array_merge( $dataMw->errors, $errs ); } $dataMw->errors = $errs; if ( $alt !== null ) { DOMCompat::replaceChildren( $span, $span->ownerDocument->createTextNode( $alt ) ); } } private static function copyOverAttribute( Element $elt, Element $span, string $attribute ): void { DOMDataUtils::addNormalizedAttribute( $elt, $attribute, DOMCompat::getAttribute( $span, $attribute ), WTSUtils::getAttributeShadowInfo( $span, $attribute )['value'] ); } private static function replaceAnchor( Env $env, PegTokenizer $urlParser, Element $container, Element $oldAnchor, array $attrs, DataMw $dataMw, bool $isImage, ?string $captionText, int $page, string $lang ): Element { $doc = $oldAnchor->ownerDocument; $attr = WTSUtils::getAttrFromDataMw( $dataMw, 'link', true ); if ( $isImage ) { $anchor = $doc->createElement( 'a' ); $addDescriptionLink = static function ( Title $title ) use ( $env, $anchor, $page, $lang ) { $href = $env->makeLink( $title ); $qs = []; if ( $page > 0 ) { $qs['page'] = $page; } if ( $lang ) { $qs['lang'] = $lang; } if ( $qs ) { $href .= '?' . http_build_query( $qs ); } $anchor->setAttribute( 'href', $href ); $anchor->setAttribute( 'class', 'mw-file-description' ); }; if ( $attr !== null ) { $discard = true; $val = $attr->value['txt']; if ( $val === '' ) { // No href if link= was specified $anchor = $doc->createElement( 'span' ); } elseif ( $urlParser->tokenizeURL( $val ) !== false ) { // An external link! 
$href = Sanitizer::cleanUrl( $env->getSiteConfig(), $val, 'external' ); $anchor->setAttribute( 'href', $href ); // Similar to AddLinkAttributes $extLinkAttribs = $env->getExternalLinkAttribs( $href ); foreach ( $extLinkAttribs as $key => $val ) { if ( $key === 'rel' ) { foreach ( $val as $v ) { DOMUtils::addRel( $anchor, $v ); } } else { $anchor->setAttribute( $key, $val ); } } } else { $link = $env->makeTitleFromText( $val, null, true ); if ( $link !== null ) { $anchor->setAttribute( 'href', $env->makeLink( $link ) ); $anchor->setAttribute( 'title', $link->getPrefixedText() ); } else { // Treat same as if link weren't present $addDescriptionLink( $attrs['title'] ); // but preserve for roundtripping $discard = false; } } if ( $discard ) { WTSUtils::getAttrFromDataMw( $dataMw, 'link', /* keep */false ); } } else { $addDescriptionLink( $attrs['title'] ); } } else { $anchor = $doc->createElement( 'span' ); } if ( $captionText ) { $anchor->setAttribute( 'title', $captionText ); } $oldAnchor->parentNode->replaceChild( $anchor, $oldAnchor ); return $anchor; } /** * @inheritDoc */ public function run( Env $env, Node $root, array $options = [], bool $atTopLevel = false ): void { '@phan-var Element|DocumentFragment $root'; // @var Element|DocumentFragment $root $urlParser = new PegTokenizer( $env ); $validContainers = []; $files = []; $containers = DOMCompat::querySelectorAll( $root, '[typeof*="mw:File"]' ); foreach ( $containers as $container ) { // DOMFragmentWrappers assume the element name of their outermost // content so, depending how the above query is written, we're // protecting against getting a figure of the wrong type. However, // since we're currently using typeof, it shouldn't be a problem. // Also note that info for the media nested in the fragment has // already been added in their respective pipeline. 
Assert::invariant( !WTUtils::isDOMFragmentWrapper( $container ), 'Media info for fragment was already added' ); // We expect this structure to be predictable based on how it's // emitted in the TT/WikiLinkHandler but treebuilding may have // messed that up for us. $anchor = $container; $reopenedAFE = []; do { // An active formatting element may have been reopened inside // the wrapper if a content model violation was encountered // during treebuiling. Try to be a little lenient about that // instead of bailing out $anchor = $anchor->firstChild; $anchorNodeName = DOMCompat::nodeName( $anchor ); if ( $anchorNodeName !== 'a' ) { $reopenedAFE[] = $anchor; } } while ( $anchorNodeName !== 'a' && isset( Consts::$HTML['FormattingTags'][$anchorNodeName] ) ); if ( $anchorNodeName !== 'a' ) { $env->log( 'error', 'Unexpected structure when adding media info.' ); continue; } $span = $anchor->firstChild; if ( !( $span instanceof Element && DOMCompat::nodeName( $span ) === 'span' ) ) { $env->log( 'error', 'Unexpected structure when adding media info.' ); continue; } $caption = $anchor->nextSibling; $isInlineMedia = WTUtils::isInlineMedia( $container ); if ( !$isInlineMedia && DOMCompat::nodeName( $caption ) !== 'figcaption' ) { $env->log( 'error', 'Unexpected structure when adding media info.' ); continue; } // For T314059. Migrate any active formatting tags we found open // inside the container to the ficaption to conform to the spec. // This should simplify selectors for clients and styling. 
// TODO: Consider exposing these as lints if ( $reopenedAFE ) { $firstAFE = $reopenedAFE[0]; $lastAFE = $reopenedAFE[count( $reopenedAFE ) - 1]; DOMUtils::migrateChildren( $lastAFE, $container ); if ( $isInlineMedia ) { // Remove the formatting elements, they are of no use // We could migrate them into the caption in data-mw, // but that doesn't seem worthwhile $firstAFE->parentNode->removeChild( $firstAFE ); } else { // Move the formatting elements into the figcaption DOMUtils::migrateChildren( $caption, $lastAFE ); $caption->appendChild( $firstAFE ); // Unconditionally clear tsr out of an abundance of caution // These tags should already be annotated as autoinserted anyways foreach ( $reopenedAFE as $afe ) { DOMDataUtils::getDataParsoid( $afe )->tsr = null; } } } $dataMw = DOMDataUtils::getDataMw( $container ); $dims = [ 'width' => (int)DOMCompat::getAttribute( $span, 'data-width' ) ?: null, 'height' => (int)DOMCompat::getAttribute( $span, 'data-height' ) ?: null, ]; $page = WTSUtils::getAttrFromDataMw( $dataMw, 'page', true ); if ( $page ) { $dims['page'] = $page->value['txt']; } $lang = DOMCompat::getAttribute( $span, 'lang' ); if ( $lang !== null ) { $dims['lang'] = $lang; } // "starttime" should be used if "thumbtime" isn't present, // but only for rendering. // "starttime" should be used if "thumbtime" isn't present, // but only for rendering. $thumbtime = WTSUtils::getAttrFromDataMw( $dataMw, 'thumbtime', true ); $starttime = WTSUtils::getAttrFromDataMw( $dataMw, 'starttime', true ); if ( $thumbtime || $starttime ) { $seek = isset( $thumbtime->value ) ? $thumbtime->value['txt'] : ( isset( $starttime->value ) ? 
$starttime->value['txt'] : '' ); $seek = self::parseTimeString( $seek ); if ( $seek !== null ) { $dims['seek'] = $seek; } } $attrs = [ 'dims' => $dims, 'format' => WTUtils::getMediaFormat( $container ), 'title' => $env->makeTitleFromText( $span->textContent ), ]; $file = [ $attrs['title']->getDBKey(), $dims ]; $infoKey = md5( json_encode( $file ) ); $files[$infoKey] = $file; $errs = []; $manualKey = null; $manualthumb = WTSUtils::getAttrFromDataMw( $dataMw, 'manualthumb', true ); if ( $manualthumb !== null ) { $val = $manualthumb->value['txt']; $title = $env->makeTitleFromText( $val, $attrs['title']->getNamespace(), true ); if ( $title === null ) { $errs[] = self::makeErr( 'apierror-invalidtitle', 'Invalid thumbnail title.', [ 'name' => $val ] ); } else { $file = [ $title->getDBkey(), $dims ]; $manualKey = md5( json_encode( $file ) ); $files[$manualKey] = $file; } } $validContainers[] = [ 'container' => $container, 'attrs' => $attrs, // Pass the anchor because we did some work to find it above 'anchor' => $anchor, 'infoKey' => $infoKey, 'manualKey' => $manualKey, 'errs' => $errs, ]; } if ( !$validContainers ) { return; } $start = microtime( true ); $infos = $env->getDataAccess()->getFileInfo( $env->getPageConfig(), array_values( $files ) ); if ( $env->profiling() ) { $profile = $env->getCurrentProfile(); $profile->bumpMWTime( "Media", 1000 * ( microtime( true ) - $start ), "api" ); $profile->bumpCount( "Media" ); } $files = array_combine( array_keys( $files ), $infos ); $hasThumb = false; $needsTMHModules = false; foreach ( $validContainers as $c ) { $container = $c['container']; $anchor = $c['anchor']; $span = $anchor->firstChild; $attrs = $c['attrs']; $dataMw = DOMDataUtils::getDataMw( $container ); $errs = $c['errs']; $hasThumb = $hasThumb || DOMUtils::hasTypeOf( $container, 'mw:File/Thumb' ); $info = $files[$c['infoKey']]; if ( !$info ) { $env->getDataAccess()->addTrackingCategory( $env->getPageConfig(), $env->getMetadata(), 'broken-file-category' ); $errs[] = 
self::makeErr( 'apierror-filedoesnotexist', 'This image does not exist.' ); } elseif ( isset( $info['thumberror'] ) ) { $errs[] = self::makeErr( 'apierror-unknownerror', $info['thumberror'] ); } // FIXME: Should we fallback to $info if there are errors with $manualinfo? // What does the legacy parser do? if ( $c['manualKey'] !== null ) { $manualinfo = $files[$c['manualKey']]; if ( !$manualinfo ) { $errs[] = self::makeErr( 'apierror-filedoesnotexist', 'This image does not exist.' ); } elseif ( isset( $manualinfo['thumberror'] ) ) { $errs[] = self::makeErr( 'apierror-unknownerror', $manualinfo['thumberror'] ); } else { $info = $manualinfo; } } if ( $info['badFile'] ?? false ) { $errs[] = self::makeErr( 'apierror-badfile', 'This image is on the bad file list.' ); } if ( WTUtils::hasVisibleCaption( $container ) ) { $captionText = null; } else { if ( WTUtils::isInlineMedia( $container ) ) { $caption = ContentUtils::createAndLoadDocumentFragment( $container->ownerDocument, $dataMw->caption ?? '' ); } else { $caption = DOMCompat::querySelector( $container, 'figcaption' ); // If the caption had tokens, it was placed in a DOMFragment // and we haven't unpacked yet if ( $caption->firstChild && DOMUtils::hasTypeOf( $caption->firstChild, 'mw:DOMFragment' ) ) { $id = DOMDataUtils::getDataParsoid( $caption->firstChild )->html; $caption = $env->getDOMFragment( $id ); } } $captionText = trim( WTUtils::textContentFromCaption( $caption ) ); // The sanitizer isn't going to do anything with a string value // for alt/title and since we're going to use dom element setters, // quote escaping should be fine. Note that if sanitization does // happen here, it should also be done to $altFromCaption so that // string comparison matches, where necessary. 
// // $sanitizedArgs = Sanitizer::sanitizeTagAttrs( $env->getSiteConfig(), 'img', null, [ // new KV( 'alt', $captionText ) // Could be a 'title' too // ] ); // $captionText = $sanitizedArgs['alt']->key; } // Info relates to the thumb, not necessarily the file. // The distinction matters for manualthumb, in which case only // the "resource" copied over from the span relates to the file. switch ( $info['mediatype'] ?? '' ) { case 'AUDIO': $handler = 'handleAudio'; $isImage = false; break; case 'VIDEO': $handler = 'handleVideo'; $isImage = false; break; default: $handler = 'handleImage'; $isImage = true; break; } $alt = null; $keepAltInDataMw = !$isImage || $errs; $attr = WTSUtils::getAttrFromDataMw( $dataMw, 'alt', $keepAltInDataMw ); if ( $attr !== null ) { $alt = $attr->value['txt']; } elseif ( $captionText ) { $alt = $captionText; } // Add mw:Error to the RDFa type. if ( $errs ) { self::handleErrors( $container, $span, $errs, $dataMw, $alt ); continue; } $needsTMHModules = $needsTMHModules || !$isImage; $env->getMetadata()->addImage( $attrs['title'], $info['timestamp'] ?? null, $info['sha1'] ?? null, ); $elt = self::$handler( $env, $span, $attrs, $info, $dataMw, $container, $alt ); DOMCompat::getClassList( $elt )->add( 'mw-file-element' ); $anchor = self::replaceAnchor( $env, $urlParser, $container, $anchor, $attrs, $dataMw, $isImage, $captionText, (int)( $attrs['dims']['page'] ?? 0 ), $attrs['dims']['lang'] ?? '' ); $anchor->appendChild( $elt ); if ( isset( $dataMw->attribs ) && count( $dataMw->attribs ) === 0 ) { unset( $dataMw->attribs ); } } if ( $hasThumb ) { $env->getMetadata()->appendOutputStrings( CMCSS::MODULE, [ 'mediawiki.page.media' ] ); } if ( $needsTMHModules ) { $env->getMetadata()->appendOutputStrings( CMCSS::MODULE_STYLE, [ 'ext.tmh.player.styles' ] ); $env->getMetadata()->appendOutputStrings( CMCSS::MODULE, [ 'ext.tmh.player' ] ); } } } PK ! 
��v� � UpdateTemplateOutput.phpnu �Iw�� <?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors;

use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\DOM\DocumentFragment;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PipelineUtils;
use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor;

/**
 * DOM processor that re-renders the output of one template in place.
 *
 * For every transclusion wrapper under the root whose first data-mw part
 * points at the template named in $options['selparData'], the wikitext of
 * that transclusion is cut out of the revision text (via the wrapper's DSR),
 * processed in a fresh pipeline, and the old wrapper nodes (all siblings
 * sharing its about id) are replaced by the newly generated content.
 */
class UpdateTemplateOutput implements Wt2HtmlDOMProcessor {
	/**
	 * FIXME:
	 * -- mwt-id counter may need to be reset!
	 * -- We have hardcoded check for Template: in English
	 * -- We aren't checking for other instances (ex: template args)
	 * -- We aren't checking for indirect dependencies (ex: nested templates)
	 * -- In the core repo, we also need to figure out what OutputTransformPipeline
	 *    stages need to run in this case.
	 *
	 * Reads $options['selparData'] (templateTitle, revText) and
	 * $options['frame']. If selparData is missing, an error is logged and
	 * the DOM is left untouched.
	 *
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		'@phan-var Element|DocumentFragment $root'; // @var Element|DocumentFragment $root

		$selparData = $options['selparData'] ?? null;
		if ( !$selparData ) {
			error_log( "Missing selpar data" );
			return;
		}

		// FIXME: Hardcoded for English
		$tplTitle = "./Template:" . $selparData->templateTitle;

		// FIXME: Insufficient - missing check for template args, indirect dependencies
		$tplNodes = DOMCompat::querySelectorAll( $root, '[typeof~="mw:Transclusion"]' );
		foreach ( $tplNodes as $tplNode ) {
			$dataMw = DOMDataUtils::getDataMw( $tplNode );
			$ti = $dataMw->parts[0] ?? null;

			// The first part may be absent (null) or a plain string; neither
			// carries an href, so neither can match the target template.
			// (Previously a null part fell through to a property read on null.)
			if ( $ti === null || is_string( $ti ) || ( $ti->href ?? null ) !== $tplTitle ) {
				continue;
			}

			$dp = DOMDataUtils::getDataParsoid( $tplNode );
			if ( !isset( $dp->dsr ) ) {
				// Without a source range the transclusion's wikitext cannot be
				// located in the revision text; leave the existing output alone.
				continue;
			}

			$wt = $dp->dsr->substr( $selparData->revText );
			$opts = [
				'pipelineType' => 'selective-update-fragment-wikitext-to-dom',
				'sol' => false, // FIXME: Not strictly correct
				'srcText' => $selparData->revText,
				'pipelineOpts' => [],
				'srcOffsets' => $dp->dsr,
			];

			// Process template string in new pipeline
			$frag = PipelineUtils::processContentInPipeline(
				$env, $options['frame'], $wt, $opts
			);

			// Pull out only the transclusion marked portion of $frag & strip p-wrapper.
			// $newParent always tracks the parent of $newContent so that the
			// migration below also works when there is no first child at all.
			$newParent = $frag;
			$newContent = $frag->firstChild;
			if (
				DOMCompat::nodeName( $tplNode ) !== 'p' &&
				$newContent instanceof Element &&
				DOMCompat::nodeName( $newContent ) === 'p'
			) {
				$newParent = $newContent;
				$newContent = $newContent->firstChild;
			}

			// Only elements carry data-parsoid; the new pipeline may yield
			// nothing, or start with a text node.
			if ( $newContent instanceof Element ) {
				DOMDataUtils::getDataParsoid( $newContent )->dsr = $dp->dsr;
			}

			// Delete template from DOM + add new content to DOM
			// Note that $tplNode and $frag may have more than one child in the general case
			$tplParent = $tplNode->parentNode;
			$about = DOMCompat::getAttribute( $tplNode, 'about' );
			do {
				$next = $tplNode->nextSibling;
				$tplParent->removeChild( $tplNode );
				$tplNode = $next;
			} while (
				$tplNode instanceof Element &&
				DOMCompat::getAttribute( $tplNode, 'about' ) === $about
			);

			// $tplNode is now the first node after the removed range (or null),
			// which is where the new content gets inserted.
			DOMUtils::migrateChildren( $newParent, $tplParent, $tplNode );
		}
	}
}
PK ! 
�f��#5 #5 AnnotationDOMRangeBuilder.phpnu �Iw�� <?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors;

use SplObjectStorage;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\DOM\Document;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\NodeData\DataParsoid;
use Wikimedia\Parsoid\NodeData\TempData;
use Wikimedia\Parsoid\NodeData\TemplateInfo;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\WTUtils;
use Wikimedia\Parsoid\Wt2Html\Frame;

/**
 * The handling of annotation ranges and transclusion ranges are somewhat different for a number of reasons.
 * - Annotation ranges can be (and typically are) nested: we want to handle a <tvar> range inside a <translate>
 *   range (whereas non-top-level transclusions are ignored). That said, this only applies to annotations of
 *   different types, so finding/handling top-level ranges of a given type is useful (hence extending the
 *   DOMRangeBuilder, still.)
 *
 * - Annotation ranges are not represented in the final document in the same way as transclusions. In an ideal
 *   world, annotations are well-nested and the corresponding range is not extended; in this case, the annotation
 *   range is only delimited by a pair of <meta> tags (that can then be displayed by VE, or ignored by
 *   read-views). The annotated content stays editable; whereas editing of templated content is always prevented.
 *
 * - Relatedly: annotation meta tags are NOT removed from the output (whereas transclusion meta tags are an
 *   intermediary state). This has an impact on fostering. It is safe to bypass the fostering of meta tags in the
 *   template case, because the meta tags will disappear anyway, and their presence in a fostering position only
 *   marks the whole table as template content. Annotation tags do not benefit from the same leeway: they will need
 *   to be moved in the right place (and, for end tags, "the right place" means the end of the table, not the start
 *   of the table - which we can handle more consistently if the meta tag ends up in the FosterBox). Hence,
 *   there is little reason to not use the general fostering pass for annotation meta tags as well (except for
 *   the consistency with transclusion meta tags).
 *
 * The assumptions here are consequently as follows:
 * - annotation <meta> tags are not in a fosterable position (they have been moved out of it in the
 *   TreeBuilderStage)
 * - during the MarkFosteredContent pass, end annotation meta tags are moved from the foster box to after the
 *   table.
 * This should guarantee that no range is reversed (so that's a case we do not have to worry about).
 */
class AnnotationDOMRangeBuilder extends DOMRangeBuilder {
	/**
	 * Helper used by moveRangeEnd() to migrate trailing newlines out of a
	 * paragraph whose closing annotation meta was moved out of it.
	 * @var MigrateTrailingNLs
	 */
	private $migrateTrailingNls;

	/**
	 * AnnotationDOMRangeBuilder constructor.
	 * Sets the trace type to "annwrap" (the parent class uses "tplwrap").
	 * @param Document $document
	 * @param Frame $frame
	 */
	public function __construct( Document $document, Frame $frame ) {
		parent::__construct( $document, $frame );
		$this->traceType = "annwrap";
		$this->migrateTrailingNls = new MigrateTrailingNLs();
	}

	/**
	 * For each range, moves the start/end annotation metas so that they sit
	 * at the computed range boundaries ($range->start / $range->end), and
	 * then re-checks that both metas share a parent element.
	 * @param array $annRanges ranges to process; each entry is read through
	 *   its startElem/endElem/start/end properties
	 */
	private function wrapAnnotationsInTree( array $annRanges ): void {
		foreach ( $annRanges as $range ) {
			// Only move a meta if the range boundary is not already the meta itself
			if ( $range->startElem !== $range->start ) {
				$this->moveRangeStart( $range, $range->start );
			}
			if ( $range->endElem !== $range->end ) {
				$this->moveRangeEnd( $range, $range->end );
			}

			// It can happen that marking range uneditable adds another layer of nesting that is not captured
			// by the initial range detection (since it's not there at that time). To avoid that, we check whether
			// both nodes have the same parent and, if not, we hoist them to a common ancestor.
			$startParent = DOMCompat::getParentElement( $range->start );
			$endParent = DOMCompat::getParentElement( $range->end );
			if ( $startParent !== $endParent ) {
				// Post-moves above, start/end have been set to the respective metas
				$correctedRange = self::findEnclosingRange( $range->start, $range->end );
				if ( $range->start !== $correctedRange->start ) {
					$this->moveRangeStart( $range, $correctedRange->start );
				}
				if ( $range->end !== $correctedRange->end ) {
					$this->moveRangeEnd( $range, $correctedRange->end );
				}
			}
		}
	}

	/**
	 * Makes the DOM range between $range->startElem and $range->endElem uneditable by wrapping
	 * it into a <div> (for block ranges) or <span> (for inline ranges) with the mw:ExtendedAnnRange
	 * type.
	 *
	 * If the end meta is not reached while walking the start meta's following
	 * siblings, everything up to the last sibling is wrapped and a warning is
	 * logged.
	 * @param DOMRangeInfo $range
	 */
	private function makeUneditable( DOMRangeInfo $range ) {
		$startMeta = $range->startElem;
		$endMeta = $range->endElem;

		// Source offsets covered by the wrapper: start of the opening meta
		// to end of the closing meta
		$actualRangeStart = DOMDataUtils::getDataParsoid( $startMeta )->dsr->start;
		$actualRangeEnd = DOMDataUtils::getDataParsoid( $endMeta )->dsr->end;

		// First pass (read-only): decide between <span> and <div> by looking
		// for a block-level node among the siblings from start to end meta.
		$inline = true;
		$node = $startMeta;
		while ( true ) {
			if ( $node === null ) {
				// Start and end aren't siblings, we'll log an error below
				break;
			}
			if ( DOMUtils::hasBlockTag( $node ) ) {
				$inline = false;
				break;
			}
			if ( $node === $endMeta ) {
				break;
			}
			$node = $node->nextSibling;
		}

		$wrap = $startMeta->ownerDocument->createElement( $inline ? 'span' : 'div' );
		$wrap->setAttribute( "typeof", "mw:ExtendedAnnRange" );
		$startMeta->parentNode->insertBefore( $wrap, $startMeta );

		// Second pass: move the siblings (start meta included) into the wrapper
		$node = $startMeta;
		while ( true ) {
			if ( $node === null ) {
				$this->env->log( 'warn',
					"End of annotation range [$actualRangeStart, $actualRangeEnd] not found. " .
					"Document marked uneditable until its end." );
				break;
			}
			// Fetch the sibling before appendChild() detaches $node from its parent
			$next = $node->nextSibling;
			$wrap->appendChild( $node );
			if ( $node === $endMeta ) {
				break;
			}
			$node = $next;
		}

		// Ensure template continuity is not broken
		// FIXME: What about if the endMeta has an about id?  Even though
		// annotations don't come from template, template ranges can subsume
		// them by adding strings to their "parts".
		// NOTE(review): by this point $startMeta has already been appended
		// into $wrap (and $endMeta too, if it was reached), so the sibling
		// lookups below are relative to positions inside the wrapper -
		// confirm that this is the intended reference point.
		$about = DOMCompat::getAttribute( $startMeta, "about" );
		$previousElt = DOMCompat::getPreviousElementSibling( $startMeta );
		$nextElt = DOMCompat::getNextElementSibling( $endMeta );
		$continuity = (
			( $previousElt && $previousElt->hasAttribute( "about" ) ) ||
			( $nextElt && $nextElt->hasAttribute( "about" ) )
		);
		if ( $about && $continuity ) {
			$wrap->setAttribute( "about", $about );
		}

		// FIXME: If we're adding an about id, we need to fixup the dsr
		// on the template to include any range we may be adding.

		// The wrapper gets a fresh data-parsoid flagged as auto-inserted at
		// both ends, with a DSR spanning the two metas.
		$dp = new DataParsoid();
		$dp->autoInsertedStart = true;
		$dp->autoInsertedEnd = true;
		$dp->dsr = new DomSourceRange( $actualRangeStart, $actualRangeEnd, 0, 0 );
		DOMDataUtils::setDataParsoid( $wrap, $dp );
	}

	/**
	 * Moves the start of the range to the designated node: the start meta is
	 * inserted right before that node (after adjusting for fostered content)
	 * and $range->start is updated to point to the meta.
	 * @param DOMRangeInfo $range the range to modify
	 * @param Node $node the new start of the range
	 */
	private function moveRangeStart( DOMRangeInfo $range, Node $node ): void {
		$startMeta = $range->startElem;
		$startDataParsoid = DOMDataUtils::getDataParsoid( $startMeta );
		if ( $node instanceof Element ) {
			if ( DOMCompat::nodeName( $node ) === "p" && $node->firstChild === $startMeta ) {
				// If the first child of "p" is the meta, and it gets moved, then it got mistakenly
				// pulled inside the paragraph, and the paragraph dsr that gets computed includes
				// it - which may lead to the tag getting duplicated on roundtrip. Hence, we
				// adjust the dsr of the paragraph in that case. We also don't consider the meta
				// tag to have been moved in that case.
				$pDataParsoid = DOMDataUtils::getDataParsoid( $node );
				$pDataParsoid->dsr->start = $startDataParsoid->dsr->end;
			} else {
				// Read back by isExtended() to decide whether the range was extended
				$startDataParsoid->wasMoved = true;
			}
		}
		$node = $this->getStartConsideringFosteredContent( $node );
		$node->parentNode->insertBefore( $startMeta, $node );
		if ( $node instanceof Element ) {
			// Ensure template continuity is not broken
			$about = DOMCompat::getAttribute( $node, "about" );
			if ( $about !== null ) {
				$startMeta->setAttribute( "about", $about );
			}
		}
		$range->start = $startMeta;
	}

	/**
	 * Moves the end of the range to the designated node: if that node is an
	 * element, the end meta is inserted right after it. In all cases
	 * $range->end is updated to point to the meta.
	 * @param DOMRangeInfo $range the range to modify
	 * @param Node $node the new end of the range
	 */
	private function moveRangeEnd( DOMRangeInfo $range, Node $node ): void {
		$endMeta = $range->endElem;
		$endDataParsoid = DOMDataUtils::getDataParsoid( $endMeta );

		if ( $node instanceof Element ) {
			// Must be captured before the meta is moved out of $node below
			$endMetaWasLastChild = $node->lastChild === $endMeta;

			// Migrate $endMeta and ensure template continuity is not broken
			$node->parentNode->insertBefore( $endMeta, $node->nextSibling );
			$about = DOMCompat::getAttribute( $node, "about" );
			if ( $about !== null ) {
				$endMeta->setAttribute( "about", $about );
			}

			if ( ( DOMCompat::nodeName( $node ) === "p" ) && $endMetaWasLastChild ) {
				// If the last child of "p" is the meta, and it gets moved, then it got mistakenly
				// pulled inside the paragraph, and the paragraph dsr that gets computed includes
				// it - which may lead to the tag getting duplicated on roundtrip. Hence, we
				// adjust the dsr of the paragraph in that case. We also don't consider the meta
				// tag to have been moved in that case.
				$pDataParsoid = DOMDataUtils::getDataParsoid( $node );
				$pDataParsoid->dsr->end = $endDataParsoid->dsr->start;
				// Shrink the paragraph's end offset by however many bytes of
				// text the trailing-newline migration removed from it
				$prevLength = strlen( $node->textContent ?? '' );
				$this->migrateTrailingNls->doMigrateTrailingNLs( $node, $this->env );
				$newLength = strlen( $node->textContent ?? '' );
				if ( $prevLength != $newLength ) {
					$pDataParsoid->dsr->end -= ( $prevLength - $newLength );
				}
			} else {
				// Read back by isExtended() to decide whether the range was extended
				$endDataParsoid->wasMoved = true;
				DOMDataUtils::setDataParsoid( $endMeta, $endDataParsoid );
			}
		}
		$range->end = $endMeta;
	}

	/**
	 * Returns whether one of the ends of the range has been moved, which corresponds to an extended
	 * range. A range flagged as extendedByOverlapMerge is always considered extended.
	 * @param DOMRangeInfo $range
	 * @return bool
	 */
	private function isExtended( DOMRangeInfo $range ): bool {
		if ( $range->extendedByOverlapMerge ) {
			return true;
		}

		$startDataParsoid = DOMDataUtils::getDataParsoid( $range->startElem );
		$endDataParsoid = DOMDataUtils::getDataParsoid( $range->endElem );

		return ( $startDataParsoid->wasMoved ?? false ) || ( $endDataParsoid->wasMoved ?? false );
	}

	/**
	 * Sets the data-mw attribute for meta tags of the provided range:
	 * records extendedRange on the start meta, copies each meta's tsr into
	 * wtOffsets, and drops rangeId from the end meta.
	 * @param DOMRangeInfo $range range whose start and end element needs to be to modified
	 * @param bool $isExtended whether the range got extended
	 */
	private function setMetaDataMwForRange( DOMRangeInfo $range, bool $isExtended ): void {
		$startDataMw = DOMDataUtils::getDataMw( $range->startElem );
		$endDataMw = DOMDataUtils::getDataMw( $range->endElem );

		$startDataMw->extendedRange = $isExtended;
		$startDataMw->wtOffsets = DOMDataUtils::getDataParsoid( $range->startElem )->tsr;
		$endDataMw->wtOffsets = DOMDataUtils::getDataParsoid( $range->endElem )->tsr;
		unset( $endDataMw->rangeId );
	}

	/**
	 * Returns the meta type of the element if it exists and matches the type expected by the
	 * current class, null otherwise
	 * @param Element $elem the element to check
	 * @return string|null
	 */
	protected function matchMetaType( Element $elem ): ?string {
		// for this class we're interested in the annotation type
		return WTUtils::matchAnnotationMeta( $elem );
	}

	/** @inheritDoc */
	protected function verifyTplInfoExpectation( ?TemplateInfo $templateInfo, TempData $tmp ): void {
		// Annotations aren't templates. Nothing to do.
	}

	/**
	 * Returns the range ID of a node - in the case of annotations, the "rangeId" property
	 * of its "data-mw" attribute (or the empty string if that property is not set).
	 * @param Element $node
	 * @return string
	 */
	protected function getRangeId( Element $node ): string {
		return DOMDataUtils::getDataMw( $node )->rangeId ?? '';
	}

	/**
	 * Intentionally a no-op in this subclass.
	 * @inheritDoc
	 */
	protected function updateDSRForFirstRangeNode( Element $target, Element $source ): void {
		// nop
	}

	/**
	 * Entry point: finds the annotation meta ranges under $root, groups them
	 * by annotation type and, one type at a time, computes the top-level
	 * non-overlapping ranges, moves their metas into place, wraps extended
	 * ranges in an uneditable wrapper, and updates the metas' data-mw.
	 *
	 * If range detection throws a RangeBuilderException, a warning is logged
	 * and nothing is processed.
	 * @param Node $root
	 */
	public function execute( Node $root ): void {
		try {
			$annRanges = $this->findWrappableMetaRanges( $root );
		} catch ( RangeBuilderException $e ) {
			$this->env->log( 'warn', 'The annotation ranges could not be fully detected. ' .
				' Annotation processing cancelled. ' );
			return;
		}

		// Bucket the ranges by the annotation type of their start meta
		$rangesByType = [];
		foreach ( $annRanges as $range ) {
			$annType = WTUtils::extractAnnotationType( $range->startElem );
			$rangesByType[$annType] ??= [];
			$rangesByType[$annType][] = $range;
		}

		foreach ( $rangesByType as $singleTypeRanges ) {
			// FIXME: The ranges in $singleTypeRanges may have start/end that
			// are no longer siblings because of the wrapping in makeUneditable.
			// wrapAnnotationsInTree tries to account for that by redoing
			// findEnclosingRange but that happens after
			// findTopLevelNonOverlappingRanges, which may rely on the assumption
			// of a linear range, further analysis is needed.
			//
			// Furthermore, makeUneditable may be messing up any ranges we've
			// already processed of other types since those aren't guaranteed
			// to be non-overlapping of the current type.

			// Start each annotation type with fresh node-to-range bookkeeping
			$this->nodeRanges = new SplObjectStorage;
			$topRanges = $this->findTopLevelNonOverlappingRanges( $root, $singleTypeRanges );
			$this->wrapAnnotationsInTree( $topRanges );
			foreach ( $topRanges as $range ) {
				$isExtended = $this->isExtended( $range );
				if ( $isExtended ) {
					$this->makeUneditable( $range );
				}
				$this->setMetaDataMwForRange( $range, $isExtended );
			}
		}
	}
}
PK ! 
Re4[~ [~ WrapSectionsState.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Assert\Assert; use Wikimedia\Assert\UnreachableException; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\Core\InternalException; use Wikimedia\Parsoid\Core\SectionMetadata; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\NodeData\DataMw; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\NodeData\TemplateInfo; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Utils\PHPUtils; use Wikimedia\Parsoid\Utils\TokenUtils; use Wikimedia\Parsoid\Utils\Utils; use Wikimedia\Parsoid\Utils\WTUtils; use Wikimedia\Parsoid\Wt2Html\Frame; class WrapSectionsState { private Env $env; private Frame $frame; /** @var Element|DocumentFragment */ private $rootNode; /** * The next section debug ID */ private int $count = 1; /** * Pseudo section count is needed to determine TOC rendering */ private int $pseudoSectionCount = 0; private Document $doc; /** * Map of about ID to first element * @var Element[] */ private array $aboutIdMap = []; private int $sectionNumber = 0; private ?WrapSectionsTplInfo $tplInfo = null; /** @var WrapSectionsTplInfo[] */ private array $tplsAndExtsToExamine = []; private int $oldLevel = 0; public function __construct( Env $env, Frame $frame, Node $rootNode ) { $this->env = $env; $this->frame = $frame; $this->rootNode = $rootNode; $this->doc = $rootNode->ownerDocument; } /** * Update section metadata needed to generate TOC. 
* * @param SectionMetadata $metadata * @param Element $heading * @param int $newLevel */ private function computeSectionMetadata( SectionMetadata $metadata, Element $heading, int $newLevel ): void { if ( !$this->env->getPageConfig()->getSuppressTOC() ) { $tocData = $this->env->getTOCData(); $tocData->addSection( $metadata ); $tocData->processHeading( $this->oldLevel, $newLevel, $metadata ); } $this->oldLevel = $newLevel; if ( WTUtils::isLiteralHTMLNode( $heading ) ) { // Literal HTML tags in wikitext don't get section edit links $metadata->fromTitle = null; $metadata->index = ''; $metadata->codepointOffset = null; } elseif ( $this->tplInfo !== null ) { $dmw = DOMDataUtils::getDataMw( $this->tplInfo->first ); $metadata->index = ''; // Match legacy parser if ( !isset( $dmw->parts ) ) { // Extension or language-variant // Need to determine what the output should be here $metadata->fromTitle = null; } elseif ( count( $dmw->parts ) > 1 ) { // Multi-part content -- cannot pick a title $metadata->fromTitle = null; } else { $p0 = $dmw->parts[0]; if ( !( $p0 instanceof TemplateInfo ) ) { throw new UnreachableException( "a single part will always be a TemplateInfo not a string" ); } if ( $p0->type === 'templatearg' ) { // Since we currently don't process templates in Parsoid, // this has to be a top-level {{{...}}} and so the content // comes from the current page. But, legacy parser returns 'false' // for this, so we'll return null as well instead of current title. $metadata->fromTitle = null; } elseif ( !empty( $p0->href ) ) { // Pick template title, but strip leading "./" prefix $tplHref = Utils::decodeURIComponent( $p0->href ); $metadata->fromTitle = PHPUtils::stripPrefix( $tplHref, './' ); if ( $this->sectionNumber >= 0 ) { // Legacy parser sets this to '' in some cases // See "Templated sections (heading from template arg)" parser test $metadata->index = 'T-' . 
$this->sectionNumber; } } else { // Legacy parser return null here $metadata->fromTitle = null; } } $metadata->codepointOffset = null; } else { $title = $this->env->getContextTitle(); // Use the dbkey (underscores) instead of text (spaces) $metadata->fromTitle = $title->getPrefixedDBKey(); $metadata->index = (string)$this->sectionNumber; // Note that our DSR counts *are* byte counts, while this core // interface expects *codepoint* counts. We are going to convert // these in a batch (for efficiency) in ::convertTOCOffsets() below $metadata->codepointOffset = DOMDataUtils::getDataParsoid( $heading )->dsr->start ?? -1; } $metadata->anchor = DOMCompat::getAttribute( $heading, 'id' ); $section = DOMDataUtils::getDataParsoid( $heading )->getTemp()->section; $metadata->line = $section['line']; $metadata->linkAnchor = $section['linkAnchor']; } /** * Should we omit this heading from TOC? * Yes if $heading is: * - generated by an extensoin */ private function shouldOmitFromTOC( Element $heading ): bool { $node = $heading->parentNode; while ( $node ) { // NOTE: Here, we are making the assumption that extensions never // emit a DOM forest and only ever have a single wrapper node. // While ExtensionHandler doesn't assume that, this seems to be borne out // in reality. But, if this assumption were not true, we would be adding // TOC entries from extension-generated about siblings into the TOC. // In scenarios where templates generated the extension and the extension // is part of the template's wrapper, we cannot reliably determine what // part of the output came from extensions in that case (because the // template wrapping clobbers that information). So, for now, we ignore // this edge case where extensions generate multiple DOM nodes (that also // have headings). Later on, we may enforce a single-wrapper-node // requirement for extensions. 
if ( WTUtils::isFirstExtensionWrapperNode( $node ) ) { return true; } $node = $node->parentNode; } return false; } /** * Create a new section element * * @param Element|DocumentFragment $rootNode * @param array<Section> &$sectionStack * @param ?Section $currSection * @param Element $heading the heading node * @param int $newLevel * @param bool $pseudoSection * @return Section */ private function createNewSection( Node $rootNode, array &$sectionStack, ?Section $currSection, Element $heading, int $newLevel, bool $pseudoSection ): Section { /* Structure for regular (editable or not) sections * <section data-mw-section-id=".."> * <h*>..</h*> * .. * </section> * * Lead sections and pseudo-sections won't have <h*> or <div> tags */ $section = new Section( $newLevel, $this->count++, $this->doc ); /* Step 1. Get section stack to the right nesting level * 1a. Pop stack till we have a higher-level section. */ $stack = &$sectionStack; $sc = count( $stack ); while ( $sc > 0 && !( $stack[$sc - 1]->hasNestedLevel( $newLevel ) ) ) { array_pop( $stack ); $sc--; } /* 1b. Push current section onto stack if it is a higher-level section */ if ( $currSection && $currSection->hasNestedLevel( $newLevel ) ) { $stack[] = $currSection; $sc++; } /* Step 2: Add new section where it belongs: a parent section OR body */ $parentSection = $sc > 0 ? $stack[$sc - 1] : null; if ( $parentSection ) { $parentSection->addSection( $section ); } else { $rootNode->insertBefore( $section->container, $heading ); } /* Step 3: Add <h*> to the <section> */ $section->addNode( $heading ); /* Step 4: Assign data-mw-section-id attribute * * CX wants <section> tags with a distinguishing attribute so that * it can differentiate between its internal use of <section> tags * with what Parsoid adds. So, we will add a data-mw-section-id * attribute always. * * data-mw-section-id = 0 for the lead section * data-mw-section-id = -1 for non-editable sections * Note that templated content cannot be edited directly. 
* data-mw-section-id = -2 for pseudo sections * data-mw-section-id > 0 for everything else and this number * matches PHP parser / MediaWiki's notion of that section. * * The code here handles uneditable sections because of templating. */ if ( $pseudoSection ) { $this->pseudoSectionCount++; $section->setId( -2 ); } elseif ( $this->tplInfo !== null ) { $section->setId( -1 ); } else { $section->setId( $this->sectionNumber ); } // Sections from extensions shouldn't show up in TOC if ( !$pseudoSection && !$this->shouldOmitFromTOC( $heading ) ) { $this->computeSectionMetadata( $section->metadata, $heading, $newLevel ); } return $section; } private function isEmptySpan( Element $span ): bool { $n = $span->firstChild; while ( $n ) { if ( $n instanceof Element ) { return false; } elseif ( $n instanceof Text && !preg_match( '/^\s*$/D', $n->nodeValue ) ) { return false; } $n = $n->nextSibling; } return true; } /** * Walk the DOM and add <section> wrappers where required. * This is the workhorse code that wrapSections relies on. * * @param ?Section $currSection * @param Element|DocumentFragment $rootNode * @return int */ private function wrapSectionsInDOM( ?Section $currSection, Node $rootNode ): int { // Since template wrapping is done and template wrappers are well-nested, // we can reset template state for every subtree. $tplInfo = null; $sectionStack = []; $highestSectionLevel = 7; $node = $rootNode->firstChild; while ( $node ) { $next = $node->nextSibling; $addedNode = false; $expandSectionBoundary = false; // Track entry into templated and extension output if ( !$this->tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) { DOMUtils::assertElt( $node ); $this->tplInfo = $tplInfo = new WrapSectionsTplInfo; $tplInfo->first = $node; $about = DOMCompat::getAttribute( $node, 'about' ); // NOTE: could be null because of language variant markup! 
$tplInfo->about = $about; $aboutSiblings = WTUtils::getAboutSiblings( $node, $about ); $tplInfo->last = end( $aboutSiblings ); $this->aboutIdMap[$about] = $node; // Collect a sequence of rendering transparent nodes starting at $node. // This could be while ( true ), but being defensive. while ( $node ) { // If we hit the end of the template, we are done! // - If this is a heading, we'll process it below. // - If not, the template never had a heading, so // we can continue default section wrapping behavior. if ( $tplInfo->last === $node ) { break; } // If we hit a non-rendering-transparent node or a non-empty span, // we are done! We cannot expand the section boundary any further. if ( !WTUtils::isRenderingTransparentNode( $node ) && !( DOMCompat::nodeName( $node ) === 'span' && !WTUtils::isLiteralHTMLNode( $node ) && $this->isEmptySpan( $node ) ) ) { break; } // Accumulate the rendering-transparent node and loop $tplInfo->rtContentNodes[] = $node; $node = $node->nextSibling; } if ( count( $tplInfo->rtContentNodes ) > 0 && DOMUtils::isHeading( $node ) ) { // In this scenario, we can expand the section boundary to include these nodes // rather than start with the heading. This eliminates unnecessary conflicts // between section & template boundaries. $expandSectionBoundary = true; $next = $node->nextSibling; } else { // Reset to normal sectioning behavior! $node = $tplInfo->first; $tplInfo->rtContentNodes = []; } } if ( DOMUtils::isHeading( $node ) ) { DOMUtils::assertElt( $node ); // headings are elements $level = (int)DOMCompat::nodeName( $node )[1]; $dp = DOMDataUtils::getDataParsoid( $node ); if ( WTUtils::isLiteralHTMLNode( $node ) ) { // HTML <h*> tags get section wrappers, but the sections are uneditable // via the section editing API. 
$this->sectionNumber = -1; } elseif ( isset( $dp->tmp->headingIndex ) ) { // This could be just `$this->sectionNumber++` without the // complicated if-guard if T214538 were fixed in core; // see T213468 where this more-complicated behavior was // added to match core's eccentricities. $this->sectionNumber = $dp->tmp->headingIndex; } if ( $level < $highestSectionLevel ) { $highestSectionLevel = $level; } $currSection = $this->createNewSection( $rootNode, $sectionStack, $currSection, $node, $level, false ); if ( $tplInfo && $expandSectionBoundary ) { foreach ( $tplInfo->rtContentNodes as $rtn ) { $currSection->container->insertBefore( $rtn, $node ); } $tplInfo->firstSection = $currSection; } $addedNode = true; } elseif ( $node instanceof Element ) { $nestedHighestSectionLevel = $this->wrapSectionsInDOM( null, $node ); if ( $currSection && !$currSection->hasNestedLevel( $nestedHighestSectionLevel ) ) { // If we find a higher level nested section, // (a) Make current section non-editable // (b) There are 2 options here best illustrated with an example. // Consider the wiktiext below. // <div> // =1= // b // </div> // c // =2= // 1. Create a new pseudo-section to wrap '$node' // There will be a <section> around the <div> which includes 'c'. // 2. Don't create the pseudo-section by setting '$currSection = null' // But, this can leave some content outside any top-level section. // 'c' will not be in any section. // The code below implements strategy 1. 
$currSection->setId( -1 ); $currSection = $this->createNewSection( $rootNode, $sectionStack, $currSection, $node, $nestedHighestSectionLevel, true ); $addedNode = true; } } if ( $currSection && !$addedNode ) { $currSection->addNode( $node ); } if ( $tplInfo && $tplInfo->first === $node ) { $tplInfo->firstSection = $currSection; } // Track exit from templated output if ( $tplInfo && $tplInfo->last === $node ) { if ( $currSection !== $tplInfo->firstSection ) { // The opening $node and closing $node of the template // are in different sections! This might require resolution. // While 'firstSection' could be null, if we get here, // 'lastSection' is guaranteed to always be non-null. $tplInfo->lastSection = $currSection; $this->tplsAndExtsToExamine[] = $tplInfo; } $this->tplInfo = $tplInfo = null; } $node = $next; } // The last section embedded in a non-body DOM element // should always be marked non-editable since it will have // the closing tag (ex: </div>) showing up in the source editor // which we cannot support in a visual editing $environment. if ( $currSection && !DOMUtils::atTheTop( $rootNode ) ) { $currSection->setId( -1 ); } return $highestSectionLevel; } /** * Is this a Parsoid-inserted section (vs. a section node generated by * other page-components / content-generators like extensions)? * * @param Element $n * @return bool */ private static function isParsoidSection( Element $n ): bool { return DOMCompat::nodeName( $n ) === 'section' && $n->hasAttribute( 'data-mw-section-id' ); } /** * Find an ancestor that is a Parsoid-inserted section * * @param Node $n * @return Element */ private static function findSectionAncestor( Node $n ): Element { do { $n = DOMUtils::findAncestorOfName( $n, 'section' ); } while ( $n && !self::isParsoidSection( $n ) ); Assert::invariant( $n instanceof Element, "Expected to find Parsoid-section ancestor" ); return $n; } /** * Get opening/closing DSR offset for the subtree rooted at $node. 
* This handles scenarios where $node is a section or template wrapper
	 * and if a section, when it has leading/trailing non-element nodes
	 * that don't have recorded DSR values.
	 *
	 * For a Parsoid <section>, leading (or trailing) text and comment nodes
	 * are walked over and their accumulated length is subtracted from
	 * (or added to) the offset of the first (or last) element child.
	 *
	 * @param Element $node
	 * @param bool $start true for the opening offset, false for the closing offset
	 * @return ?int null when no offset can be determined for this subtree
	 */
	private function getDSR( Element $node, bool $start ): ?int {
		if ( !self::isParsoidSection( $node ) ) {
			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
			if ( !$dsr ) {
				// Fall back to the DSR recorded on the first node with this about id
				Assert::invariant( $node->hasAttribute( 'about' ), 'Expected an about id' );
				$about = DOMCompat::getAttribute( $node, 'about' );
				$dsr = DOMDataUtils::getDataParsoid( $this->aboutIdMap[$about] )->dsr ?? null;
			}
			if ( !$dsr ) {
				// No DSR anywhere: report "unknown" rather than dereferencing null
				return null;
			}

			return $start ? $dsr->start : $dsr->end;
		}

		$offset = 0;
		$c = $start ? $node->firstChild : $node->lastChild;
		while ( $c ) {
			if ( $c instanceof Text ) {
				$offset += strlen( $c->textContent );
			} elseif ( $c instanceof Comment ) {
				$offset += WTUtils::decodedCommentLength( $c );
			} else {
				DOMUtils::assertElt( $c );
				$ret = $this->getDSR( $c, $start );
				return $ret === null ? null : $ret + ( $start ? -$offset : $offset );
			}
			$c = $start ? $c->nextSibling : $c->previousSibling;
		}

		// The section has no element child to anchor an offset on.
		// Return null (not a -1 sentinel) so that fillDSRGap() rejects it
		// instead of slicing the source text with a negative offset.
		return null;
	}

	/**
	 * Append the source text between two offsets to $parts.
	 * Nothing is appended when $offset1 >= $offset2.
	 *
	 * FIXME: Duplicated with TableFixups code.
	 *
	 * @param list<string|TemplateInfo> &$parts
	 * @param ?int $offset1 start of the gap
	 * @param ?int $offset2 end of the gap
	 * @throws InternalException if either offset is null
	 */
	private function fillDSRGap( array &$parts, ?int $offset1, ?int $offset2 ): void {
		if ( $offset1 === null || $offset2 === null ) {
			throw new InternalException();
		}
		if ( $offset1 < $offset2 ) {
			$parts[] = PHPUtils::safeSubstr( $this->frame->getSrcText(), $offset1, $offset2 - $offset1 );
		}
	}

	/**
	 * FIXME: There is strong overlap with TableFixups code.
	 *
	 * $wrapper will hold tpl/ext encap info for the array of tpls/exts as well as
	 * content before, after and in between them. Right now, this will always be a
	 * <section> node, but not asserting this since code doesn't depend on it being so.
*
	 * @param Element $wrapper Node that receives the combined typeof / data-mw / pi info
	 * @param array $encapWrappers Encapsulation wrapper nodes to fold into $wrapper
	 */
	private function collapseWrappers( Element $wrapper, array $encapWrappers ): void {
		$wrapperDp = DOMDataUtils::getDataParsoid( $wrapper );

		// Build up $parts, $pi to set up the combined transclusion info on $wrapper
		$parts = [];
		$pi = [];
		$index = 0;
		$prevDp = null;
		$haveTemplate = false;
		try {
			foreach ( $encapWrappers as $encapNode ) {
				$dp = DOMDataUtils::getDataParsoid( $encapNode );

				// Plug DSR gaps between encapWrappers
				if ( !$prevDp ) {
					$this->fillDSRGap( $parts, $wrapperDp->dsr->start, $dp->dsr->start );
				} else {
					$this->fillDSRGap( $parts, $prevDp->dsr->end, $dp->dsr->start );
				}

				if ( DOMUtils::hasTypeOf( $encapNode, "mw:Transclusion" ) ) {
					$haveTemplate = true;
					// Assimilate $encapNode's data-mw and data-parsoid pi info
					$dmw = DOMDataUtils::getDataMw( $encapNode );
					foreach ( $dmw->parts ?? [] as $part ) {
						// Template index is relative to other transclusions.
						// This index is used to extract whitespace information from
						// data-parsoid and that array only includes info for templates.
						// So skip over strings here.
						if ( !is_string( $part ) ) {
							$part = clone $part;
							$part->i = $index++;
						}
						$parts[] = $part;
					}
					PHPUtils::pushArray( $pi, $dp->pi ?? [ [] ] );
				} else {
					// Where a non-template type is present, we are going to treat that
					// segment as a "string" in the parts array. So, we effectively treat
					// "mw:Transclusion" as a generic type that covers a single template
					// as well as a run of segments where at least one segment comes from
					// a template but others may be from other generators (ex: extensions).
					$this->fillDSRGap( $parts, $dp->dsr->start, $dp->dsr->end );
				}
				$prevDp = $dp;
			}

			// Without at least one template segment, there is no transclusion to describe
			if ( !$haveTemplate ) {
				throw new InternalException();
			}

			DOMUtils::addTypeOf( $wrapper, "mw:Transclusion" );
			$wrapperDp->pi = $pi;
			// Plug the trailing gap between the last wrapper and the end of $wrapper
			$this->fillDSRGap( $parts, $prevDp->dsr->end, $wrapperDp->dsr->end );
			$dataMw = new DataMw( [] );
			$dataMw->parts = $parts;
			DOMDataUtils::setDataMw( $wrapper, $dataMw );
		} catch ( InternalException $e ) {
			// We don't have accurate template wrapping information.
			// Set typeof to 'mw:Placeholder' since 'mw:Transclusion'
			// typeof is not actionable without valid data-mw.
			//
			// FIXME:
			// 1. If we stop stripping section wrappers in the html->wt direction,
			//    we will need to add a DOMHandler for <section> or mw:Placeholder typeof
			//    on arbitrary Elements to traverse into children and serialize and
			//    prevent page corruption.
			// 2. This may be a good place to collect stats for T191641#6357136
			// 3. Maybe we need a special error typeof rather than mw:Placeholder
			$wrapper->setAttribute( 'typeof', 'mw:Placeholder' );
		}
	}

	/**
	 * Section wrappers and encapsulation wrappers can conflict because of
	 * partial overlaps. This method identifies those conflicts and fixes up
	 * the encapsulation by expanding those ranges as necessary.
	 */
	private function resolveTplExtSectionConflicts(): void {
		// Maps an about id to [ 'start' => .., 'end' => .., 'encapWrappers' => [..] ]
		$secRanges = [];
		'@phan-var array[] $secRanges';
		foreach ( $this->tplsAndExtsToExamine as $tplInfo ) {
			$s1 = $tplInfo->firstSection->container ??
				self::findSectionAncestor( $tplInfo->first );

			// guaranteed to be non-null
			$s2 = $tplInfo->lastSection->container;

			// Find a common ancestor of s1 and s2 (could be s1 or s2)
			$s2Ancestors = DOMUtils::pathToRoot( $s2 );
			$s1Ancestors = [];
			$n = 0;
			$ancestor = $s1;
			while ( !in_array( $ancestor, $s2Ancestors, true ) ) {
				$s1Ancestors[] = $ancestor;
				$ancestor = $ancestor->parentNode;
				$n++;
			}
			// ancestor is now the common ancestor of s1 and s2
			$s1Ancestors[] = $ancestor;
			$n++;

			// Set up start/end of the new encapsulation range
			if ( $ancestor === $s1 || $ancestor === $s2 ) {
				$start = $ancestor;
				$end = $ancestor;
			} else {
				// While creating a new section (see createNewSection), it only
				// gets added where its parent is either another section,
				// or body, so all ancestors are themselves sections, or body.
				// $start / $end are the children of the common ancestor
				// that lie on the paths to $s1 / $s2 respectively.
				$start = $s1Ancestors[$n - 2];
				$i = array_search( $ancestor, $s2Ancestors, true );
				$end = $s2Ancestors[$i - 1];
			}

			'@phan-var Element $start'; // @var Element $start
			'@phan-var Element $end'; // @var Element $end

			// Add new OR update existing range
			if ( $start->hasAttribute( 'about' ) ) {
				// Overlaps with an existing range.
				$about = DOMCompat::getAttribute( $start, 'about' );
				if ( !$end->hasAttribute( 'about' ) ) {
					// Extend existing range till $end
					$secRanges[$about]['end'] = $end;
					$end->setAttribute( 'about', $about );
				} else {
					Assert::invariant( DOMCompat::getAttribute( $end, 'about' ) === $about,
						"Expected end-range about id to be $about instead of " .
						DOMCompat::getAttribute( $end, 'about' ) .
						" in the overlap scenario." );
				}
			} else {
				// Check for nesting in another range. Since $start and $end
				// are siblings, this is sufficient to know the entire range
				// is nested
				$about = null;
				$n = $start->parentNode;
				$body = DOMCompat::getBody( $start->ownerDocument );
				while ( $n !== $body ) {
					'@phan-var Element $n'; // @var Element $n
					if ( self::isParsoidSection( $n ) && $n->hasAttribute( 'about' ) ) {
						$about = DOMCompat::getAttribute( $n, 'about' );
						break;
					}
					$n = $n->parentNode;
				}

				if ( !$about ) {
					// Not overlapping, not nested => new range
					$about = $this->env->newAboutId();
					$start->setAttribute( 'about', $about );
					$end->setAttribute( 'about', $about );
					$secRanges[$about] = [ 'start' => $start, 'end' => $end, 'encapWrappers' => [] ];
				}
			}
			$secRanges[$about]['encapWrappers'][] = $tplInfo->first;
		}

		// Process recorded ranges into new encapsulation information
		// that spans all content in that range.
		foreach ( $secRanges as $about => $range ) {
			// Ensure that all top level nodes of the range have the same about id
			for ( $n = $range['start']; $n !== $range['end']->nextSibling; $n = $n->nextSibling ) {
				Assert::invariant( self::isParsoidSection( $n ),
					"Encountered non-Parsoid-section node (" .
					DOMCompat::nodeName( $n ) .
					") while updating template wrappers" );
				$n->setAttribute( 'about', $about );
			}

			$dsr1 = $this->getDSR( $range['start'], true ); // Traverses non-tpl content => will succeed
			$dsr2 = $this->getDSR( $range['end'], false ); // Traverses non-tpl content => will succeed
			$dp = new DataParsoid;
			$dp->dsr = new DomSourceRange( $dsr1, $dsr2, null, null );
			DOMDataUtils::setDataParsoid( $range['start'], $dp );

			$this->collapseWrappers( $range['start'], $range['encapWrappers'] );
		}
	}

	/**
	 * Convert the non-null codepointOffset of every TOC section from the
	 * environment's current offset type to 'char' offsets, in one batch.
	 */
	private function convertTOCOffsets() {
		// Create reference array from all the codepointOffsets
		// (references, so that the conversion updates the sections in place)
		$offsets = [];
		foreach ( $this->env->getTOCData()->getSections() as $section ) {
			if ( $section->codepointOffset !== null ) {
				$offsets[] = &$section->codepointOffset;
			}
		}
		TokenUtils::convertOffsets( $this->env->topFrame->getSrcText(), $this->env->getCurrentOffsetType(), 'char', $offsets );
	}

	/**
	 * In core, Parser.php adds a TOC marker before the *first* heading element
	 * independent of how that heading element is nested. In the common case,
	 * that insertion point corresponds to the last element of the lead section
	 * as computed by section wrapping code in this file. In the edge case, when
	 * a <div> wraps the heading, the insertion point lies inside the <div> and
	 * has no relation to the lead section.
*/
	private static function findTOCInsertionPoint( Node $elt ): ?Element {
		while ( $elt ) {
			// Ignore extension content while finding TOC insertion point
			if ( WTUtils::isFirstExtensionWrapperNode( $elt ) ) {
				$elt = WTUtils::skipOverEncapsulatedContent( $elt );
				continue;
			}
			if ( $elt instanceof Element ) {
				if ( DOMUtils::isHeading( $elt ) ) {
					return $elt;
				} elseif ( $elt->firstChild ) {
					// Depth-first: look inside this element before moving to its siblings
					$tocIP = self::findTOCInsertionPoint( $elt->firstChild );
					if ( $tocIP ) {
						return $tocIP;
					}
				}
			}
			$elt = $elt->nextSibling;
		}
		return null;
	}

	/**
	 * Insert a synthetic section in which to place the TOC
	 *
	 * The new <section data-mw-section-id="-2"> is inserted immediately before
	 * $insertionPoint and $syntheticTocMeta is appended to it.
	 *
	 * @param Element $syntheticTocMeta the TOC marker <meta>
	 * @param Element $insertionPoint the node before which the section is inserted
	 * @return Element the newly created <section>
	 */
	private function insertSyntheticSection(
		Element $syntheticTocMeta, Element $insertionPoint
	): Element {
		$prev = $insertionPoint->previousSibling;

		// Create a pseudo-section containing the TOC
		$syntheticTocSection = $this->doc->createElement( 'section' );
		$syntheticTocSection->setAttribute( 'data-mw-section-id', '-2' );
		$insertionPoint->parentNode->insertBefore( $syntheticTocSection, $insertionPoint );
		$this->pseudoSectionCount++;
		$syntheticTocSection->appendChild( $syntheticTocMeta );

		// Ensure template continuity is not broken!
		// If $prev is not an encapsulation wrapper, nothing to do!
		if ( $prev && WTUtils::isEncapsulationWrapper( $prev ) ) {
			'@phan-var Element $prev';
			$prevAbout = DOMCompat::getAttribute( $prev, 'about' );

			// An encapsulation wrapper may lack an about id. In that case there
			// is no about-continuity to preserve, and comparing null === null
			// below would pass null to setAttribute().
			if ( $prevAbout !== null ) {
				// First, handle the case of section-tag-stripping that VE does:
				// without <section> wrappers, the TOC meta sits between $prev and
				// the first child of $insertionPoint. If the about ids are
				// different, $next & $prev belong to different transclusions
				// and the TOC meta can be left alone.
				$next = $insertionPoint->firstChild;
				$nextAbout = $next instanceof Element ?
					DOMCompat::getAttribute( $next, 'about' ) : null;
				if ( $prevAbout === $nextAbout ) {
					$syntheticTocMeta->setAttribute( 'about', $prevAbout );
				}

				// Now handle the case of section-tags not being stripped.
				// NOTE that the synthetic section is before $insertionPoint,
				// so its neighbours are $prev and $insertionPoint.
				$nextAbout = DOMCompat::getAttribute( $insertionPoint, 'about' );
				if ( $prevAbout === $nextAbout ) {
					$syntheticTocSection->setAttribute( 'about', $prevAbout );
				}
			}
		}

		return $syntheticTocSection;
	}

	/**
	 * Decide from the toc / notoc / forcetoc behavior switches and the number
	 * of headings whether a TOC is shown, set the matching output flags and,
	 * when needed, insert an auto-generated TOC marker <meta>.
	 */
	private function addSyntheticTOCMarker(): void {
		// Add a synthetic TOC at the end of the first section, if necessary
		$tocBS = $this->env->getBehaviorSwitch( 'toc' );
		$noTocBS = $this->env->getBehaviorSwitch( 'notoc' );
		$forceTocBS = $this->env->getBehaviorSwitch( 'forcetoc' );
		$showToc = true;
		if ( $noTocBS && !$tocBS ) {
			$showToc = false;
		}
		$numHeadings = $this->count - 1 - $this->pseudoSectionCount; // $this->count is initialized to 1
		$enoughToc = $showToc && ( $numHeadings >= 4 || $tocBS );
		if ( $forceTocBS ) {
			$showToc = true;
			$enoughToc = true;
		}
		if ( $numHeadings === 0 ) {
			$enoughToc = false;
		}
		if ( !$this->env->getPageConfig()->getSuppressTOC() ) {
			if ( $enoughToc ) {
				// ParserOutputFlags::SHOW_TOC
				$this->env->getMetadata()->setOutputFlag( 'show-toc' );
				if ( !$tocBS ) {
					$syntheticTocMeta = $this->doc->createElement( 'meta' );
					$syntheticTocMeta->setAttribute( 'property', 'mw:PageProp/toc' );
					$dmw = DOMDataUtils::getDataMw( $syntheticTocMeta );
					$dmw->autoGenerated = true;

					$tocIP = self::findTOCInsertionPoint( DOMCompat::getBody( $this->doc ) );
					if ( $tocIP === null ) {
						// should not happen, but nothing to do here!
						return;
					}

					// NOTE: Given how <section>s are computed in this file, headings
					// will never have previous siblings. So, we always look at the
					// previous sibling of the heading's section ($insertionPoint).
					$insertionPoint = self::findSectionAncestor( $tocIP );
					$insertionContainer = $insertionPoint->previousSibling;
					if (
						!$insertionContainer ||
						DOMCompat::nodeName( $insertionContainer ) !== 'section'
					) {
						$insertionContainer = $this->insertSyntheticSection(
							$syntheticTocMeta, $insertionPoint
						);
					}
					$insertionContainer->appendChild( $syntheticTocMeta );

					// Set a synthetic zero-length dsr to suppress noisy warnings
					// from the round trip testing script.
					$syntheticOffset = DOMDataUtils::getDataParsoid( $tocIP )->dsr->start ?? null;
					if ( $syntheticOffset !== null ) {
						$dp = DOMDataUtils::getDataParsoid( $syntheticTocMeta );
						$dp->dsr = new DomSourceRange( $syntheticOffset, $syntheticOffset, 0, 0 );
					}
				}
			}
			if ( !$showToc ) {
				// ParserOutputFlags::NO_TOC
				$this->env->getMetadata()->setOutputFlag( 'no-toc' );
			}
		}
	}

	/** Transfer information about section links from behaviour switches to CMC */
	private function addSectionInfo() {
		$newSectionLink = $this->env->getBehaviorSwitch( 'newsectionlink' );
		if ( $newSectionLink !== null ) {
			// ParserOutputFlags::NEW_SECTION
			$this->env->getMetadata()->setOutputFlag( 'mw-NewSection', $newSectionLink );
		}
		$noNewSectionLink = $this->env->getBehaviorSwitch( 'nonewsectionlink' );
		if ( $noNewSectionLink !== null ) {
			// ParserOutputFlags::HIDE_NEW_SECTION
			$this->env->getMetadata()->setOutputFlag( 'mw-HideNewSection', $noNewSectionLink );
		}
		$noEditSection = $this->env->getBehaviorSwitch( 'noeditsection' );
		if ( $noEditSection !== null ) {
			// ParserOutputFlags::NO_SECTION_EDIT_LINKS
			$this->env->getMetadata()->setOutputFlag( 'no-section-edit-links', $noEditSection );
		}
	}

	/**
	 * DOM Postprocessor entry function to walk DOM rooted at $root
	 * and add <section> wrappers as necessary.
* Implements the algorithm documented @ mw:Parsing/Notes/Section_Wrapping */ public function run(): void { // 6 is the lowest possible level since we don't want // any nesting of h-tags in the lead section $leadSection = new Section( 6, 0, $this->doc ); $leadSection->setId( 0 ); $this->wrapSectionsInDOM( $leadSection, $this->rootNode ); // There will always be a lead section, even if sometimes it only // contains whitespace + comments. $this->rootNode->insertBefore( $leadSection->container, $this->rootNode->firstChild ); // Resolve template conflicts after all sections have been added to the DOM $this->resolveTplExtSectionConflicts(); // Convert byte offsets to codepoint offsets in TOCData // (done in a batch to avoid O(N^2) string traversals) $this->convertTOCOffsets(); $this->addSyntheticTOCMarker(); $this->addSectionInfo(); } } PK ! ����� � PWrapState.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMUtils; class PWrapState { private const RANGE_TYPE_RE = '!^mw:(Transclusion(/|$)|Param(/|$)|Annotation/)!'; /** @var ?Element */ public $p = null; /** @var bool */ private $hasOptionalNode = false; /** * About ids of starts we've seen in this paragraph * * @var array */ private $seenStarts = []; /** * Unwrap + reset */ public function reset() { $this->unwrapTrailingPWrapOptionalNodes(); $this->p = null; $this->hasOptionalNode = false; $this->seenStarts = []; } /** * Record that we've encountered an optional node to potentially unwrap * * @param Node $n */ public function processOptionalNode( Node $n ) { $t = DOMUtils::matchNameAndTypeOf( $n, 'meta', self::RANGE_TYPE_RE ); $this->hasOptionalNode = (bool)$t || $this->hasOptionalNode; if ( $t && !str_ends_with( $t, '/End' ) ) { '@phan-var Element $n'; // @var Element $n $this->seenStarts[DOMCompat::getAttribute( $n, 'about' )] = true; } 
} /** * Unwrap a run of trailing nodes that don't need p-wrapping. * This only matters for meta tags representing transclusions * and annotations. Unwrapping can prevent unnecessary expansion * of template/annotation ranges. */ private function unwrapTrailingPWrapOptionalNodes() { if ( $this->hasOptionalNode ) { $lastChild = $this->p->lastChild; while ( PWrap::pWrapOptional( $lastChild ) ) { $t = DOMUtils::matchNameAndTypeOf( $lastChild, 'meta', self::RANGE_TYPE_RE ); if ( $t && str_ends_with( $t, '/End' ) ) { '@phan-var Element $lastChild'; // @var Element $lastChild // Check if one of its prior siblings has a matching opening tag. // If so, we are done with unwrapping here since we don't want to // hoist this closing tag by itself. $aboutId = DOMCompat::getAttribute( $lastChild, 'about' ); if ( $this->seenStarts[$aboutId] ?? null ) { break; } } $this->p->parentNode->insertBefore( $lastChild, $this->p->nextSibling ); $lastChild = $this->p->lastChild; } } } } PK ! �ǐ�: : WrapSectionsTplInfo.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; class WrapSectionsTplInfo { public Element $first; // FIXME: This maybe-null feels broken. // This is because language variant markup is considered // encapsulated content (by WTUtils helpers) right now but // they may not have any about ids. public ?string $about; public Node $last; /** @var Node[] */ public array $rtContentNodes = []; public ?Section $firstSection; public ?Section $lastSection; } PK ! 
�2�nx+ x+ MarkFosteredContent.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Assert\Assert; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\Document; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\NodeData\TempData; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Utils\WTUtils; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; /** * Non-IEW (inter-element-whitespace) can only be found in <td> <th> and * <caption> tags in a table. If found elsewhere within a table, such * content will be moved out of the table and be "adopted" by the table's * sibling ("foster parent"). The content that gets adopted is "fostered * content". * * http://www.w3.org/TR/html5/syntax.html#foster-parent * @module */ class MarkFosteredContent implements Wt2HtmlDOMProcessor { /** * Create a new DOM node with attributes. 
* * @param Document $document * @param string $type * @param array $attrs * @return Element */ private static function createNodeWithAttributes( Document $document, string $type, array $attrs ): Element { $node = $document->createElement( $type ); DOMUtils::addAttributes( $node, $attrs ); return $node; } /** * Cleans up transclusion shadows, keeping track of fostered transclusions * * @param Node $node * @return bool */ private static function removeTransclusionShadows( Node $node ): bool { $sibling = null; $fosteredTransclusions = false; if ( $node instanceof Element ) { if ( DOMUtils::isMarkerMeta( $node, 'mw:TransclusionShadow' ) ) { $node->parentNode->removeChild( $node ); return true; } elseif ( DOMDataUtils::getDataParsoid( $node )->getTempFlag( TempData::IN_TRANSCLUSION ) ) { $fosteredTransclusions = true; } $node = $node->firstChild; while ( $node ) { $sibling = $node->nextSibling; if ( self::removeTransclusionShadows( $node ) ) { $fosteredTransclusions = true; } $node = $sibling; } } return $fosteredTransclusions; } /** * Inserts metas around the fosterbox and table * * @param Env $env * @param Node $fosterBox * @param Element $table */ private static function insertTransclusionMetas( Env $env, Node $fosterBox, Element $table ): void { $aboutId = $env->newAboutId(); // Ensure we have depth entries for 'aboutId'. $docDataBag = DOMDataUtils::getBag( $table->ownerDocument ); $docDataBag->transclusionMetaTagDepthMap[$aboutId]['start'] = $docDataBag->transclusionMetaTagDepthMap[$aboutId]['end'] = DOMUtils::nodeDepth( $table ); // You might be asking yourself, why is $table->dataParsoid->tsr->end always // present? The earlier implementation searched the table's siblings for // their tsr->start. However, encapsulation doesn't happen when the foster box, // and thus the table, are in the transclusion. 
$s = self::createNodeWithAttributes( $fosterBox->ownerDocument, 'meta', [ 'about' => $aboutId, 'id' => substr( $aboutId, 1 ), 'typeof' => 'mw:Transclusion', ] ); $dp = new DataParsoid; $dp->tsr = clone DOMDataUtils::getDataParsoid( $table )->tsr; $dp->setTempFlag( TempData::FROM_FOSTER ); DOMDataUtils::setDataParsoid( $s, $dp ); $fosterBox->parentNode->insertBefore( $s, $fosterBox ); $e = self::createNodeWithAttributes( $table->ownerDocument, 'meta', [ 'about' => $aboutId, 'typeof' => 'mw:Transclusion/End', ] ); $sibling = $table->nextSibling; $beforeText = null; // Skip past the table end, mw:shadow and any transclusions that // start inside the table. There may be newlines and comments in // between so keep track of that, and backtrack when necessary. while ( $sibling ) { if ( !WTUtils::isTplStartMarkerMeta( $sibling ) && ( WTUtils::isEncapsulatedDOMForestRoot( $sibling ) || DOMUtils::isMarkerMeta( $sibling, 'mw:TransclusionShadow' ) ) ) { $sibling = $sibling->nextSibling; $beforeText = null; } elseif ( $sibling instanceof Comment || $sibling instanceof Text ) { if ( !$beforeText ) { $beforeText = $sibling; } $sibling = $sibling->nextSibling; } else { break; } } $table->parentNode->insertBefore( $e, $beforeText ?: $sibling ); } /** * @param Node $e * @param Node $firstFosteredNode * @param Element|DocumentFragment $tableParent * @param ?Node $tableNextSibling */ private static function moveFosteredAnnotations( Node $e, Node $firstFosteredNode, $tableParent, ?Node $tableNextSibling ): void { if ( WTUtils::isAnnotationStartMarkerMeta( $e ) && $e !== $firstFosteredNode ) { '@phan-var Element $e'; DOMDataUtils::getDataParsoid( $e )->wasMoved = true; $firstFosteredNode->parentNode->insertBefore( $e, $firstFosteredNode ); } elseif ( WTUtils::isAnnotationEndMarkerMeta( $e ) ) { '@phan-var Element $e'; DOMDataUtils::getDataParsoid( $e )->wasMoved = true; $tableParent->insertBefore( $e, $tableNextSibling ); } elseif ( $e instanceof Element && $e->hasChildNodes() ) { // 
/**
 * Create an empty wrapper element for fostered content.
 *
 * The wrapper is a <span> when $inPTag is true and a <p> otherwise. Its
 * data-parsoid is marked as fostered and as having an auto-inserted start.
 *
 * @param Document $doc Document that will own the wrapper
 * @param bool $inPTag Whether the wrapper will live in a paragraph context
 * @return Element
 */
private static function getFosterContentHolder( Document $doc, bool $inPTag ): Element {
	$holderDP = new DataParsoid;
	$holderDP->fostered = true;
	// Set autoInsertedStart for bug-compatibility with the old ProcessTreeBuilderFixups code
	$holderDP->autoInsertedStart = true;

	$holder = $doc->createElement( $inPTag ? 'span' : 'p' );
	DOMDataUtils::setDataParsoid( $holder, $holderDP );

	return $holder;
}

/**
 * Walk the children of $node looking for FosterBoxes. For each one found:
 * - every following sibling up to the accompanying table is marked as
 *   fostered (directly, or by moving it into a fostered wrapper);
 * - the fosterbox + table are wrapped with transclusion metas if any of the
 *   fostered content came from a transclusion;
 * - fostered annotations are moved and the fosterbox itself is removed.
 *
 * TransclusionShadow marker metas found outside a fosterbox run are dropped,
 * and all other elements with children are processed recursively.
 *
 * @param Node $node
 * @param Env $env
 */
private static function processRecursively( Node $node, Env $env ): void {
	// $cursor is captured before the current node is touched because the
	// body may remove or move nodes; it is also advanced past fostered
	// siblings so that the walk resumes at the table.
	for ( $current = $node->firstChild; $current; $current = $cursor ) {
		$cursor = $current->nextSibling;

		if ( !DOMUtils::hasNameAndTypeOf( $current, 'table', 'mw:FosterBox' ) ) {
			if ( DOMUtils::isMarkerMeta( $current, 'mw:TransclusionShadow' ) ) {
				$current->parentNode->removeChild( $current );
			} elseif ( $current instanceof Element && $current->hasChildNodes() ) {
				self::processRecursively( $current, $env );
			}
			continue;
		}

		$inPTag = DOMUtils::hasNameOrHasAncestorOfName( $current->parentNode, 'p' );
		$holder = self::getFosterContentHolder( $current->ownerDocument, $inPTag );
		$fosteredNodes = [];
		$sawTransclusion = false;

		// Everything between the fosterbox and the table is fostered content.
		while ( $cursor ) {
			if ( $cursor instanceof Element && DOMCompat::nodeName( $cursor ) === 'table' ) {
				break;
			}

			$fosteredNodes[] = $cursor;
			$following = $cursor->nextSibling;

			if ( !( $cursor instanceof Element ) ) {
				$holder->appendChild( $cursor );
			} else {
				// TODO: Note the similarity here with the p-wrapping pass.
				// This can likely be combined in some more maintainable way.
				if ( DOMUtils::isRemexBlockNode( $cursor ) || PWrap::pWrapOptional( $cursor ) ) {
					// Block nodes don't need a p-tag wrapper, and neither do
					// links, includeonly directives and other
					// rendering-transparent nodes. sol-transparent wikitext
					// generates rendering-transparent nodes, so that helper
					// is used as a proxy here.
					DOMDataUtils::getDataParsoid( $cursor )->fostered = true;

					// A non-empty wrapper is closed (inserted before this
					// node) and a fresh one is started.
					if ( $holder->hasChildNodes() ) {
						$cursor->parentNode->insertBefore( $holder, $cursor );
						$holder = self::getFosterContentHolder( $cursor->ownerDocument, $inPTag );
					}
				} else {
					$holder->appendChild( $cursor );
				}

				if ( self::removeTransclusionShadows( $cursor ) ) {
					$sawTransclusion = true;
				}
			}

			$cursor = $following;
		}

		$table = $cursor;

		// we should be able to reach the table from the fosterbox
		Assert::invariant(
			$table instanceof Element && DOMCompat::nodeName( $table ) === 'table',
			"Table isn't a sibling. Something's amiss!"
		);

		if ( $holder->hasChildNodes() ) {
			$table->parentNode->insertBefore( $holder, $table );
		}

		// Fostered transclusion content: wrap the whole thing in a transclusion
		if ( $sawTransclusion ) {
			self::insertTransclusionMetas( $env, $current, $table );
		}

		// When more than one meta tag has to be inserted after the table, they
		// can be kept in traversal order (by holding on to the initial
		// $table->nextSibling) or in reverse traversal order (by updating the
		// reference to each inserted meta). The outcome depends on nesting:
		// with <ann1><ann2></ann2></ann1> in the fosterbox, traversal order
		// yields <ann1><ann2>TABLE</ann2></ann1>; with
		// <ann1></ann1><ann2></ann2> it yields <ann1><ann2>TABLE</ann1></ann2>.
		// Reverse order inverts those outcomes.
		// This only matters for annotations of different types. Right now
		// there are only two (<translate> and <tvar>), and <tvar> can only
		// exist nested in <translate>, so traversal order is used to preserve
		// the existing nesting order.
		// (Keeping a stack of opening metas and re-adding them in inverse
		// order after the table would add significant complexity for marginal
		// benefit as long as there are no further annotation types.)
		$tableParent = $table->parentNode;
		$tableNextSibling = $table->nextSibling;

		// this needs to happen after inserting the transclusion meta so that
		// they get included in the transclusion
		foreach ( $fosteredNodes as $elem ) {
			'@phan-var Element $elem';
			self::moveFosteredAnnotations( $elem, $fosteredNodes[0], $tableParent, $tableNextSibling );
		}

		// remove the foster box
		$current->parentNode->removeChild( $current );
	}
}

/**
 * Entry point of the pass: process the whole subtree rooted at $root.
 *
 * @inheritDoc
 */
public function run(
	Env $env, Node $root, array $options = [], bool $atTopLevel = false
): void {
	self::processRecursively( $root, $env );
}
/**
 * DOM pass that normalizes the tree rooted at $root via DOMCompat::normalize().
 */
class Normalize implements Wt2HtmlDOMProcessor {
	/**
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		'@phan-var Element|DocumentFragment $root';  // @var Element|DocumentFragment $root
		DOMCompat::normalize( $root );
	}
}

/**
 * Very thin shim to call ContentUtils::convertOffsets where requested
 * in the environment.
 */
class ConvertOffsets implements Wt2HtmlDOMProcessor {
	/**
	 * DOM postprocessor entry function: converts the DSR offsets of the
	 * document owning $root from 'byte' to the offset type requested in
	 * the environment, and records that type in the page bundle.
	 *
	 * @see ContentUtils::convertOffsets
	 *
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		Assert::invariant( $atTopLevel, 'This pass should only be run on the top-level' );

		$document = $root->ownerDocument;
		$targetType = $env->getRequestOffsetType();
		ContentUtils::convertOffsets( $env, $document, 'byte', $targetType );

		// The linter runs before this DOM pass, so the offsets of the lints
		// collected so far are still in 'byte' and need the same conversion.
		$lintsNeedConversion = ( $targetType !== 'byte' );
		if ( $lintsNeedConversion ) {
			$lints = $env->getLints();
			LintLogger::convertDSROffsets( $env, $lints, 'byte', $targetType );
			$env->setLints( $lints );
		}

		DOMDataUtils::getPageBundle( $document )->parsoid['offsetType'] = $targetType;
	}
}

/**
 * Top-level DOM pass that hands the document to the language converter
 * unless the environment asks for that pass to be skipped.
 */
class LangConverter implements Wt2HtmlDOMProcessor {
	/**
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		Assert::invariant( $atTopLevel, 'This pass should only be run on the top-level' );

		if ( !$env->getSkipLanguageConversionPass() ) {
			$document = $root->ownerDocument;
			$htmlVariant = $env->getHtmlVariantLanguageBcp47();
			$wtVariant = $env->getWtVariantLanguageBcp47();
			LanguageConverter::maybeConvert( $env, $document, $htmlVariant, $wtVariant );
		}
	}
}
/**
 * A wrapper to call extension-specific DOM processors.
 *
 * FIXME: There are two potential ordering problems here.
 *
 * 1. unpackDOMFragment should always run immediately
 *    before these extensionPostProcessors, which we do currently.
 *    This ensures packed content get processed correctly by extensions
 *    before additional transformations are run on the DOM.
 *
 * This ordering issue is handled through documentation.
 *
 * 2. This has existed all along (in the PHP parser as well as Parsoid
 *    which is probably how the ref-in-ref hack works - because of how
 *    parser functions and extension tags are processed, #tag:ref doesn't
 *    see a nested ref anymore) and this patch only exposes that problem
 *    more clearly with the unpackOutput property.
 *
 * * Consider the set of extensions that
 *   (a) process wikitext
 *   (b) provide an extensionPostProcessor
 *   (c) run the extensionPostProcessor only on the top-level
 *   As of today, there is exactly one extension (Cite) that has all
 *   these properties, so the problem below is a speculative problem
 *   for today. But, this could potentially be a problem in the future.
 *
 * * Let us say there are at least two of them, E1 and E2 that
 *   support extension tags <e1> and <e2> respectively.
 *
 * * Let us say in an instance of <e1> on the page, <e2> is present
 *   and in another instance of <e2> on the page, <e1> is present.
 *
 * * In what order should E1's and E2's extensionPostProcessors be
 *   run on the top-level? Depending on what these handlers do, you
 *   could get potentially different results. You can see this quite
 *   starkly with the unpackOutput flag.
 *
 * * The ideal solution to this problem is to require that every extension's
 *   extensionPostProcessor be idempotent which lets us run these
 *   post processors repeatedly till the DOM stabilizes. But, this
 *   still doesn't necessarily guarantee that ordering doesn't matter.
 *   It just guarantees that with the unpackOutput flag set to false
 *   multiple extensions, all sealed fragments get fully processed.
 *   So, we still need to worry about that problem.
 *
 *   But, idempotence *could* potentially be a sufficient property in most cases.
 *   To see this, consider that there is a Footnotes extension which is similar
 *   to the Cite extension in that they both extract inline content in the
 *   page source to a separate section of output and leave behind pointers to
 *   the global section in the output DOM. Given this, the Cite and Footnote
 *   extension post processors would essentially walk the dom and
 *   move any existing inline content into that global section till it is
 *   done. So, even if a <footnote> has a <ref> and a <ref> has a <footnote>,
 *   we ultimately end up with all footnote content in the footnotes section
 *   and all ref content in the references section and the DOM stabilizes.
 *   Ordering is irrelevant here.
 *
 *   So, perhaps one way of catching these problems would be in code review
 *   by analyzing what the DOM postprocessor does and see if it introduces
 *   potential ordering issues.
 */
class RunExtensionProcessors implements Wt2HtmlDOMProcessor {
	/**
	 * Extension DOM processors, instantiated lazily on the first run()
	 * and then reused; null until initialize() has been called.
	 */
	private ?array $extProcessors = null;

	/**
	 * Instantiate every DOM processor registered by the site configuration.
	 *
	 * FIXME: We've lost the ability to dump dom pre/post individual
	 * extension processors. Need to fix RunExtensionProcessors to
	 * reintroduce that granularity
	 *
	 * @param Env $env
	 * @return array List of objects created from the registered specs
	 *   (each is asserted to be an ExtDOMProcessor by the object factory)
	 */
	private function initialize( Env $env ): array {
		$siteConfig = $env->getSiteConfig();
		// Fetched on first use only, so nothing is requested from the site
		// config when no extension registers a processor.
		$objectFactory = null;

		$extProcessors = [];
		foreach ( $siteConfig->getExtDOMProcessors() as $domProcs ) {
			foreach ( $domProcs as $classNameOrSpec ) {
				$objectFactory ??= $siteConfig->getObjectFactory();
				// Extension post processor, object factory spec given
				$extProcessors[] = $objectFactory->createObject( $classNameOrSpec, [
					'allowClassName' => true,
					'assertClass' => ExtDOMProcessor::class,
				] );
			}
		}

		return $extProcessors;
	}

	/**
	 * Run wtPostprocess() of every registered extension processor on $root,
	 * in registration order. Reads $options['extApi'].
	 *
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		$this->extProcessors ??= $this->initialize( $env );
		foreach ( $this->extProcessors as $ep ) {
			$ep->wtPostprocess( $options['extApi'], $root, $options );
		}
	}
}
class WrapTemplates implements Wt2HtmlDOMProcessor {
	/**
	 * Encapsulate template-affected DOM structures by wrapping text nodes into
	 * spans and adding RDFa attributes to all subtree roots according to
	 * http://www.mediawiki.org/wiki/Parsoid/RDFa_vocabulary#Template_content
	 *
	 * Reads $options['inTemplate'] (optional; a missing key is treated as
	 * false) and $options['frame'].
	 *
	 * @inheritDoc
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		// Don't run this in template content.
		// !empty() rather than a bare read: $options defaults to [], and an
		// absent key should mean "not in a template", not raise a warning.
		if ( !empty( $options['inTemplate'] ) ) {
			return;
		}

		$op = new DOMRangeBuilder( $root->ownerDocument, $options['frame'] );
		$op->execute( $root );
	}
}

/**
 * A <section> wrapper element under construction, together with its
 * heading level and section metadata.
 */
class Section {
	/** @var int Level passed to the constructor; compared in hasNestedLevel() */
	private $level;

	/**
	 * Useful during debugging, unrelated to data-mw-section-id
	 * @var int
	 */
	private $debugId;

	/** @var Element The <section> element */
	public $container;

	/** @var SectionMetadata */
	public $metadata;

	/**
	 * @param int $level
	 * @param int $debugId
	 * @param Document $ownerDoc Document used to create the <section> element
	 */
	public function __construct( int $level, int $debugId, Document $ownerDoc ) {
		$this->level = $level;
		$this->debugId = $debugId;
		$this->container = $ownerDoc->createElement( 'section' );
		// Use named arguments here in PHP 8.0+
		$this->metadata = new SectionMetadata(
			-1 /* tocLevel */,
			$level /* hLevel */
		);
	}

	/**
	 * Record the section id in the data-mw-section-id attribute.
	 *
	 * @param int $id
	 */
	public function setId( int $id ): void {
		$this->container->setAttribute( 'data-mw-section-id', (string)$id );
		// $this->container->setAttribute( 'data-debug-id', (string)$this->debugId );
	}

	/**
	 * Set the about attribute of the <section> element.
	 *
	 * @param string $aboutId
	 */
	public function setAboutId( string $aboutId ): void {
		$this->container->setAttribute( 'about', $aboutId );
	}

	/**
	 * Append $node as the last child of the <section> element.
	 *
	 * @param Node $node
	 */
	public function addNode( Node $node ): void {
		$this->container->appendChild( $node );
	}

	/**
	 * Append the container of $section as the last child of this section.
	 *
	 * @param Section $section
	 */
	public function addSection( Section $section ): void {
		// error_log( "Appending to " . $this->debugId . '\n' );
		$this->container->appendChild( $section->container );
	}

	/**
	 * Does this section have a nesting level of $level?
	 * True when $level is strictly greater than this section's own level.
	 *
	 * @param int $level
	 * @return bool
	 */
	public function hasNestedLevel( int $level ): bool {
		return $level > $this->level;
	}
}

/**
 * Holder object for a nullable list of DOMRangeInfo.
 */
class DOMRangeInfoArray {
	/** @var DOMRangeInfo[]|null */
	public $ranges;
}
/**
 * Add red links to a document.
 *
 * Every a[rel~="mw:WikiLink"] under $root is looked up (in batches of
 * LINK_BATCH_SIZE) and its class, title and href are updated:
 * - missing and not known, not the context page: class "new" plus
 *   action=edit&redlink=1 in the query string;
 * - pointing at the context page: "mw-selflink"/"selflink" (or
 *   "mw-selflink-fragment" when the href has a fragment), title removed;
 * - redirects get "mw-redirect"; any 'linkclasses' from the page data are
 *   added as well;
 * - links that only resolve through a language variant are retargeted to
 *   the variant title (T258856).
 *
 * @inheritDoc
 */
public function run(
	Env $env, Node $root, array $options = [], bool $atTopLevel = false
): void {
	'@phan-var Element|DocumentFragment $root';  // @var Element|DocumentFragment $root
	$allLinks = PHPUtils::iterable_to_array(
		DOMCompat::querySelectorAll( $root, 'a[rel~="mw:WikiLink"]' )
	);

	// Split up processing into chunks of 1000 so that we don't exceed LinkCache::MAX_SIZE
	$chunks = array_chunk( $allLinks, self::LINK_BATCH_SIZE );
	foreach ( $chunks as $links ) {
		$titles = [];
		foreach ( $links as $a ) {
			$t = DOMCompat::getAttribute( $a, 'title' );
			if ( $t !== null ) {
				$titles[$t] = true;
			}
		}

		if ( !$titles ) {
			// Nothing to look up in this chunk. Move on to the next one
			// instead of returning: later chunks may still have titled links.
			continue;
		}

		$start = microtime( true );
		$titleMap = $env->getDataAccess()->getPageInfo(
			$env->getPageConfig(), array_keys( $titles )
		);
		if ( $env->profiling() ) {
			$profile = $env->getCurrentProfile();
			$profile->bumpMWTime( "RedLinks", 1000 * ( microtime( true ) - $start ), "api" );
			$profile->bumpCount( "RedLinks" );
		}

		$prefixedTitleText = $env->getContextTitle()->getPrefixedText();
		$variantMap = $this->getVariantTitles( $env, $root->ownerDocument, $titles, $titleMap );

		foreach ( $links as $a ) {
			$k = DOMCompat::getAttribute( $a, 'title' );
			if ( $k === null ) {
				continue;
			}
			// Variant data, when present, takes precedence over the data
			// fetched for the original title.
			$variantData = $variantMap[$k] ?? null;
			$data = $variantData ?? $titleMap[$k] ?? null;

			if ( $data === null ) {
				// Likely a consequence of T237535; can be removed once
				// that is fixed.
				$env->log( 'warn', 'We should have data for the title: ' . $k );
				continue;
			}

			// Convert links pointing to a variant title (T258856)
			if ( $variantData !== null ) {
				$variantTitle = $env->makeTitleFromURLDecodedStr( $variantData['variantTitle'] );
				$origHref = DOMCompat::getAttribute( $a, 'href' );
				$origUrl = UrlUtils::parseUrl( $origHref ?? '' );
				$newUrl = UrlUtils::parseUrl( $env->makeLink( $variantTitle ) );
				// Carry the original query and fragment over to the variant URL.
				$newUrl['query'] = $origUrl['query'] ?? null;
				$newUrl['fragment'] = $origUrl['fragment'] ?? null;
				$variantPrefixedText = $variantTitle->getPrefixedText();
				DOMDataUtils::addNormalizedAttribute(
					$a, 'title', $variantPrefixedText, $k
				);
				// Set $k to the new title for the selflink check below.
				// Note that getVariantTitles doesn't set $variantData for
				// missing titles, so we won't be in this block for the
				// red-link-title case below.
				$k = $variantPrefixedText;
				DOMDataUtils::addNormalizedAttribute(
					$a,
					'href',
					UrlUtils::assembleUrl( $newUrl ),
					$origHref,
					// Ensure we preserve the real original value
					// added during initial link parsing.
					true
				);
			}

			$a->removeAttribute( 'class' ); // Clear all, if we're doing a pb2pb refresh

			$href = DOMCompat::getAttribute( $a, 'href' );
			$parsedURL = UrlUtils::parseUrl( $href ?? '' );

			$queryElts = [];
			if ( isset( $parsedURL['query'] ) ) {
				parse_str( $parsedURL['query'], $queryElts );
			}

			if (
				!empty( $data['missing'] ) && empty( $data['known'] ) &&
				$k !== $prefixedTitleText
			) {
				DOMCompat::getClassList( $a )->add( 'new' );
				WTUtils::addPageContentI18nAttribute( $a, 'title', 'red-link-title', [ $k ] );
				$queryElts['action'] = 'edit';
				$queryElts['redlink'] = '1';
			} else {
				if ( $k === $prefixedTitleText ) {
					if ( isset( $parsedURL['fragment'] ) ) {
						DOMCompat::getClassList( $a )->add( 'mw-selflink-fragment' );
					} else {
						DOMCompat::getClassList( $a )->add( 'mw-selflink', 'selflink' );
					}
					$a->removeAttribute( 'title' );
				}
				// Clear a potential redlink, if we're doing a pb2pb refresh
				// This is similar to what's happening in Html2Wt/RemoveRedLinks
				// and maybe that pass should just run before this one.
				if ( isset( $queryElts['action'] ) && $queryElts['action'] === 'edit' ) {
					unset( $queryElts['action'] );
				}
				if ( isset( $queryElts['redlink'] ) && $queryElts['redlink'] === '1' ) {
					unset( $queryElts['redlink'] );
				}
			}

			if ( count( $queryElts ) === 0 ) {
				// avoids the insertion of ? on empty query string
				$parsedURL['query'] = null;
			} else {
				$parsedURL['query'] = http_build_query( $queryElts );
			}
			$newHref = UrlUtils::assembleUrl( $parsedURL );

			$a->setAttribute( 'href', $newHref );

			if ( !empty( $data['redirect'] ) ) {
				DOMCompat::getClassList( $a )->add( 'mw-redirect' );
			}
			foreach ( $data['linkclasses'] ?? [] as $extraClass ) {
				DOMCompat::getClassList( $a )->add( $extraClass );
			}
		}
	}
}

/**
 * Attempt to resolve nonexistent link targets using their variants (T258856)
 *
 * @param Env $env
 * @param DOMDocument $doc
 * @param array $titles map keyed by page titles
 * @param array $titleMap map of resolved page data keyed by title
 * @return array map of resolved variant page data keyed by original title;
 *   each entry is the variant's page data plus a 'variantTitle' string
 */
private function getVariantTitles(
	Env $env, DOMDocument $doc, array $titles, array $titleMap
): array {
	// Optimize for the common case where the page language has no variants
	if ( !$env->langConverterEnabled() ) {
		return [];
	}

	$origsByVariant = [];

	// Gather all nonexistent page titles to search for their variants
	foreach ( array_keys( $titles ) as $title ) {
		if (
			// T237535
			isset( $titleMap[$title] ) &&
			(
				empty( $titleMap[$title]['missing'] ) ||
				!empty( $titleMap[$title]['known'] )
			)
		) {
			continue;
		}

		// array_keys converts strings representing numbers to ints.
		// So, cast $title to string explicitly.
		$variantTitles = LanguageConverter::autoConvertToAllVariants( $env, $doc, (string)$title );

		foreach ( $variantTitles as $variantTitle ) {
			$origsByVariant[$variantTitle][] = $title;
		}
	}

	$variantsByOrig = [];
	$variantTitles = array_keys( $origsByVariant );

	foreach ( array_chunk( $variantTitles, self::LINK_BATCH_SIZE ) as $variantChunk ) {
		$variantChunkData = $env->getDataAccess()->getPageInfo(
			$env->getPageConfig(),
			$variantChunk
		);

		// Map resolved variant titles to their corresponding originals
		foreach ( $variantChunkData as $variantTitle => $pageData ) {
			// Handle invalid titles
			// For example, a conversion might result in a title that's too long.
			if ( !empty( $pageData['invalid'] ) ) {
				continue;
			}

			// Handle non-existent variant titles
			if ( !empty( $pageData['missing'] ) && empty( $pageData['known'] ) ) {
				continue;
			}

			// "?? []": tolerate a result key that was not among the
			// requested variants rather than iterating over an undefined
			// entry.
			foreach ( $origsByVariant[$variantTitle] ?? [] as $origTitle ) {
				// Array keys that look like numbers are ints (see above),
				// so hand the title back as a string.
				$variantsByOrig[$origTitle] = [ 'variantTitle' => (string)$variantTitle ] + $pageData;
			}
		}
	}

	return $variantsByOrig;
}
/**
 * For an explanation of what TSR is, see ComputeDSR::computeNodeDSR()
 *
 * TSR info on all these tags are only valid for the opening tag.
 *
 * On other tags, a, hr, br, meta-marker tags, the tsr spans
 * the entire DOM, not just the tag.
 *
 * This code is not in Wikitext\Consts.php because this
 * information is Parsoid-implementation-specific.
 */
private const WT_TAGS_WITH_LIMITED_TSR = [
	"b" => true,
	"i" => true,
	"h1" => true,
	"h2" => true,
	"h3" => true,
	"h4" => true,
	"h5" => true,
	"h6" => true,
	"ul" => true,
	"ol" => true,
	"dl" => true,
	"li" => true,
	"dt" => true,
	"dd" => true,
	"table" => true,
	"caption" => true,
	"tr" => true,
	"td" => true,
	"th" => true,
	"hr" => true, // void element
	"br" => true, // void element
	"pre" => true,
];

/**
 * Do $parsoidData->tsr values span the entire DOM subtree rooted at $n?
 *
 * @param Element $n
 * @param DataParsoid $parsoidData
 * @return bool False when the tsr only covers the opening tag
 */
private function tsrSpansTagDOM( Element $n, DataParsoid $parsoidData ): bool {
	// The tsr is limited to the tag itself for:
	// - tags known to have tag-specific tsr
	// - html tags with 'stx' set
	// - tags with certain typeof properties (Parsoid-generated
	//   constructs: placeholders, lang variants)
	$name = DOMCompat::nodeName( $n );
	return !(
		isset( self::WT_TAGS_WITH_LIMITED_TSR[$name] ) ||
		DOMUtils::matchTypeOf( $n, '/^mw:(Placeholder|LanguageVariant)$/D' ) ||
		WTUtils::hasLiteralHTMLMarker( $parsoidData )
	);
}

/**
 * Is the inconsistency between two different ways of computing
 * start offset ($cs, $s) explainable and acceptable?
 * If so, we can suppress warnings.
 *
 * Note: $cs and $s are part of the signature but the decision is made
 * from $node and $opts alone.
 *
 * @param array $opts
 * @param Node $node
 * @param int $cs
 * @param int $s
 * @return bool
 */
private function acceptableInconsistency( array $opts, Node $node, int $cs, int $s ): bool {
	/**
	 * 1. For wikitext URL links, suppress cs-s diff warnings because
	 *    the diffs can come about because of various reasons since the
	 *    canonicalized/decoded href will become the a-link text whose width
	 *    will not match the tsr width of source wikitext
	 *
	 *    (a) urls with encoded chars (ex: 'http://example.com/?foo&#61;bar')
	 *    (b) non-canonical spaces (ex: 'RFC  123' with extra whitespace
	 *        instead of 'RFC 123')
	 *
	 * 2. We currently don't have source offsets for attributes.
	 *    So, we get a lot of spurious complaints about cs/s mismatch
	 *    when DSR computation hit the <body> tag on this attribute.
	 *    $opts['attrExpansion'] tell us when we are processing an attribute
	 *    and let us suppress the mismatch warning on the <body> tag.
	 *
	 * 3. Other scenarios .. to be added
	 */
	$isUrlOrMagicLink = $node instanceof Element && (
		WTUtils::isATagFromURLLinkSyntax( $node ) ||
		WTUtils::isATagFromMagicLinkSyntax( $node )
	);

	return $isUrlOrMagicLink ||
		( isset( $opts['attrExpansion'] ) && DOMUtils::atTheTop( $node ) );
}

/**
 * Compute wikitext string length that contributes to this
 * list item's open tag. Closing tag width is always 0 for lists.
 *
 * @param Element $li
 * @return int
 */
private function computeListEltWidth( Element $li ): int {
	if ( !$li->previousSibling && $li->firstChild ) {
		if ( DOMUtils::isList( $li->firstChild ) ) {
			// Special case!!
			// First child of a list that is on a chain
			// of nested lists doesn't get a width.
			return 0;
		}
	}

	// count nest listing depth and assign
	// that to the opening tag width.
	$depth = 0;

	// This is the crux of the algorithm in DOMHandler::getListBullets()
	while ( !DOMUtils::atTheTop( $li ) ) {
		$dp = DOMDataUtils::getDataParsoid( $li );
		if ( DOMUtils::isListOrListItem( $li ) ) {
			if ( DOMUtils::isListItem( $li ) ) {
				$depth++;
			}
		} elseif (
			// Only literal HTML wrappers whose start and end were both
			// auto-inserted are walked through; anything else ends the chain.
			!WTUtils::isLiteralHTMLNode( $li ) ||
			empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd )
		) {
			break;
		}
		$li = $li->parentNode;
	}

	return $depth;
}

/**
 * Compute wikitext string lengths that contribute to this
 * anchor's opening (<a>) and closing (</a>) tags.
 *
 * @param Element $node
 * @param ?DataParsoid $dp
 * @return int[]|null [ open width, close width ], or null when the
 *   widths cannot be determined
 */
private function computeATagWidth( Element $node, ?DataParsoid $dp ): ?array {
	/* -------------------------------------------------------------
	 * Tag widths are computed as per this logic here:
	 *
	 * 1. [[Foo|bar]] <-- piped mw:WikiLink
	 *     -> start-tag: "[[Foo|"
	 *     -> content  : "bar"
	 *     -> end-tag  : "]]"
	 *
	 * 2. [[Foo]] <-- non-piped mw:WikiLink
	 *     -> start-tag: "[["
	 *     -> content  : "Foo"
	 *     -> end-tag  : "]]"
	 *
	 * 3. [[{{1x|Foo}}|Foo]] <-- tpl-attr mw:WikiLink
	 *     Don't bother setting tag widths since dp->sa['href'] will be
	 *     the expanded target and won't correspond to original source.
	 *
	 * 4. [http://wp.org foo] <-- mw:ExtLink
	 *     -> start-tag: "[http://wp.org "
	 *     -> content  : "foo"
	 *     -> end-tag  : "]"
	 * -------------------------------------------------------------- */
	if ( !$dp ) {
		return null;
	}

	if (
		WTUtils::isATagFromWikiLinkSyntax( $node ) &&
		!WTUtils::hasExpandedAttrsType( $node )
	) {
		if ( isset( $dp->stx ) && $dp->stx === "piped" ) {
			// this seems like some kind of a phan bug
			$href = $dp->sa['href'] ?? null;
			// Explicit null/empty test: a plain truthiness check would
			// also reject the legitimate target "0" ([[0|zero]]).
			if ( $href === null || $href === '' ) {
				return null;
			}
			// "[[" + target + "|" for the open tag, "]]" for the close tag
			return [ strlen( $href ) + 3, 2 ];
		}
		return [ 2, 2 ];
	}

	if ( isset( $dp->tsr ) && WTUtils::isATagFromExtLinkSyntax( $node ) ) {
		return [
			$dp->tmp->extLinkContentOffsets->start - $dp->tsr->start,
			1
		];
	}

	if (
		WTUtils::isATagFromURLLinkSyntax( $node ) ||
		WTUtils::isATagFromMagicLinkSyntax( $node )
	) {
		return [ 0, 0 ];
	}

	return null;
}

/**
 * Compute wikitext string lengths that contribute to this
 * node's opening and closing tags.
 *
 * @param int|null $stWidth Start tag width
 * @param int|null $etWidth End tag width
 * @param Element $node
 * @param DataParsoid $dp
 * @return int[] Start and end tag widths
 */
private function computeTagWidths( $stWidth, $etWidth, Element $node, DataParsoid $dp ): array {
	// Explicit extension tag offsets win over everything else.
	if ( isset( $dp->extTagOffsets ) ) {
		return [
			$dp->extTagOffsets->openWidth,
			$dp->extTagOffsets->closeWidth
		];
	}

	if ( WTUtils::hasLiteralHTMLMarker( $dp ) ) {
		if ( !empty( $dp->selfClose ) ) {
			$etWidth = 0;
		}
	} elseif ( DOMUtils::hasTypeOf( $node, 'mw:LanguageVariant' ) ) {
		$stWidth = 2; // -{
		$etWidth = 2; // }-
	} else {
		$nodeName = DOMCompat::nodeName( $node );
		// 'tr' tags not in the original source have zero width
		if ( $nodeName === 'tr' && !isset( $dp->startTagSrc ) ) {
			$stWidth = 0;
			$etWidth = 0;
		} else {
			$wtTagWidth = Consts::$WtTagWidths[$nodeName] ?? null;
			if ( $stWidth === null ) {
				// we didn't have a tsr to tell us how wide this tag was.
				if ( $nodeName === 'a' ) {
					$wtTagWidth = $this->computeATagWidth( $node, $dp );
					$stWidth = $wtTagWidth ? $wtTagWidth[0] : null;
				} elseif ( $nodeName === 'li' || $nodeName === 'dd' ) {
					$stWidth = $this->computeListEltWidth( $node );
				} elseif ( $wtTagWidth ) {
					$stWidth = $wtTagWidth[0];
				}
			}

			if ( $etWidth === null && $wtTagWidth ) {
				$etWidth = $wtTagWidth[1];
			}
		}
	}

	return [ $stWidth, $etWidth ];
}

/**
 * Log a "trace/dsr" message built by concatenating $args; non-string
 * arguments are JSON-encoded. The message is built inside a closure
 * handed to the logger.
 *
 * @param Env $env
 * @param mixed ...$args
 */
private function trace( Env $env, ...$args ): void {
	$env->log( "trace/dsr", static function () use ( $args ) {
		$buf = '';
		foreach ( $args as $arg ) {
			$buf .= is_string( $arg ) ? $arg : PHPUtils::jsonEncode( $arg );
		}
		return $buf;
	} );
}
* * [s,e) -- if defined, start/end position of wikitext source that generated * node's subtree * * @param Frame $frame * @param Node $node node to process * @param ?int $s start position, inclusive * @param ?int $e end position, exclusive * @param int $dsrCorrection * @param array $opts * @return array */ private function computeNodeDSR( Frame $frame, Node $node, ?int $s, ?int $e, int $dsrCorrection, array $opts ): array { $env = $frame->getEnv(); if ( $e === null && !$node->hasChildNodes() ) { $e = $s; } $this->trace( $env, "BEG: ", DOMCompat::nodeName( $node ), " with [s, e]=", [ $s, $e ] ); /** @var int|null $ce Child end */ $ce = $e; // Initialize $cs to $ce to handle the zero-children case properly // if this $node has no child content, then the start and end for // the child dom are indeed identical. Alternatively, we could // explicitly code this check before everything and bypass this. /** @var int|null $cs Child start */ $cs = $ce; $child = $node->lastChild; while ( $child !== null ) { $prevChild = $child->previousSibling; $origCE = $ce; $cType = $child->nodeType; $fosteredNode = false; $cs = null; if ( $child instanceof Element ) { $dp = DOMDataUtils::getDataParsoid( $child ); $endTSR = $dp->tmp->endTSR ?? null; if ( $endTSR ) { $ce = $endTSR->end; } } else { $endTSR = null; } // StrippedTag marker tags will be removed and won't // be around to fill in the missing gap. So, absorb its width into // the DSR of its previous sibling. Currently, this fix is only for // B and I tags where the fix is clear-cut and obvious. $next = $child->nextSibling; if ( $next instanceof Element ) { $ndp = DOMDataUtils::getDataParsoid( $next ); if ( isset( $ndp->src ) && DOMUtils::hasTypeOf( $next, 'mw:Placeholder/StrippedTag' ) && // NOTE: This inlist check matches the case in CleanUp where // the placeholders are not removed from the DOM. 
We don't want // to move the width into the sibling here and then leave around a // a zero width placeholder because serializeDOMNode only handles // a few cases of zero width nodes, so we'll end up duplicating // it from ->src. !DOMUtils::isNestedInListItem( $next ) ) { if ( isset( Consts::$WTQuoteTags[$ndp->name] ) && isset( Consts::$WTQuoteTags[DOMCompat::nodeName( $child )] ) ) { $correction = strlen( $ndp->src ); $ce += $correction; $dsrCorrection = $correction; if ( Utils::isValidDSR( $ndp->dsr ?? null ) ) { // Record original DSR for the meta tag // since it will now get corrected to zero width // since child acquires its width-> $ndp->getTemp()->origDSR = new DomSourceRange( $ndp->dsr->start, $ndp->dsr->end, null, null ); } } } } $env->log( "trace/dsr", static function () use ( $child, $cs, $ce ) { // slow, for debugging only $i = 0; foreach ( $child->parentNode->childNodes as $x ) { if ( $x === $child ) { break; } $i++; } return " CHILD: <" . DOMCompat::nodeName( $child->parentNode ) . ":" . $i . ">=" . ( $child instanceof Element ? '' : ( $child instanceof Text ? '#' : '!' ) ) . ( ( $child instanceof Element ) ? ( DOMCompat::nodeName( $child ) === 'meta' ? DOMCompat::getOuterHTML( $child ) : DOMCompat::nodeName( $child ) ) : PHPUtils::jsonEncode( $child->nodeValue ) ) . " with " . PHPUtils::jsonEncode( [ $cs, $ce ] ); } ); if ( $cType === XML_TEXT_NODE ) { if ( $ce !== null ) { $cs = $ce - strlen( $child->textContent ); } } elseif ( $cType === XML_COMMENT_NODE ) { '@phan-var Comment $child'; // @var Comment $child if ( $ce !== null ) { // Decode HTML entities & re-encode as wikitext to find length $cs = $ce - WTUtils::decodedCommentLength( $child ); } } elseif ( $cType === XML_ELEMENT_NODE ) { DOMUtils::assertElt( $child ); $dp = DOMDataUtils::getDataParsoid( $child ); $tsr = $dp->tsr ?? null; $oldCE = $tsr ? $tsr->end : null; $propagateRight = false; $stWidth = null; $etWidth = null; $fosteredNode = $dp->fostered ?? 
false; // We are making dsr corrections to account for // stripped tags (end tags usually). When stripping happens, // in most common use cases, a corresponding end tag is added // back elsewhere in the DOM. // // So, when an autoInsertedEnd tag is encountered and a matching // dsr-correction is found, make a 1-time correction in the // other direction. // // Currently, this fix is only for // B and I tags where the fix is clear-cut and obvious. if ( $ce !== null && !empty( $dp->autoInsertedEnd ) && DOMUtils::isQuoteElt( $child ) ) { $correction = 3 + strlen( DOMCompat::nodeName( $child ) ); if ( $correction === $dsrCorrection ) { $ce -= $correction; $dsrCorrection = 0; } } if ( DOMCompat::nodeName( $child ) === "meta" ) { if ( $tsr ) { if ( WTUtils::isTplMarkerMeta( $child ) ) { // If this is a meta-marker tag (for templates, extensions), // we have a new valid '$cs'. This marker also effectively resets tsr // back to the top-level wikitext source range from nested template // source range. $cs = $tsr->start; $ce = $tsr->end; $propagateRight = true; } else { // All other meta-tags: <includeonly>, <noinclude>, etc. 
$cs = $tsr->start; $ce = $tsr->end; } } elseif ( PreHandler::isIndentPreWS( $child ) ) { // Adjust start DSR; see PreHandler::newIndentPreWS() $cs = $ce - 1; } elseif ( DOMUtils::matchTypeOf( $child, '#^mw:Placeholder(/\w*)?$#D' ) && $ce !== null && $dp->src ) { $cs = $ce - strlen( $dp->src ); } if ( isset( $dp->extTagOffsets ) ) { $stWidth = $dp->extTagOffsets->openWidth; $etWidth = $dp->extTagOffsets->closeWidth; unset( $dp->extTagOffsets ); } } elseif ( DOMUtils::hasTypeOf( $child, "mw:Entity" ) && $ce !== null && $dp->src ) { $cs = $ce - strlen( $dp->src ); } else { if ( DOMUtils::matchTypeOf( $child, '#^mw:Placeholder(/\w*)?$#D' ) && $ce !== null && $dp->src ) { $cs = $ce - strlen( $dp->src ); } else { // Non-meta tags if ( $endTSR ) { $etWidth = $endTSR->length(); } if ( $tsr && empty( $dp->autoInsertedStart ) ) { $cs = $tsr->start; if ( $this->tsrSpansTagDOM( $child, $dp ) ) { if ( $tsr->end !== null && $tsr->end > 0 ) { $ce = $tsr->end; $propagateRight = true; } } else { $stWidth = $tsr->end - $tsr->start; } $this->trace( $env, " TSR: ", $tsr, "; cs: ", $cs, "; ce: ", $ce ); } elseif ( $s && $child->previousSibling === null ) { $cs = $s; } } // Compute width of opening/closing tags for this dom $node [ $stWidth, $etWidth ] = $this->computeTagWidths( $stWidth, $etWidth, $child, $dp ); if ( !empty( $dp->autoInsertedStart ) ) { $stWidth = 0; } if ( !empty( $dp->autoInsertedEnd ) ) { $etWidth = 0; } $ccs = $cs !== null && $stWidth !== null ? $cs + $stWidth : null; $cce = $ce !== null && $etWidth !== null ? $ce - $etWidth : null; /* ----------------------------------------------------------------- * Process DOM rooted at '$child'. * * NOTE: You might wonder why we are not checking for the zero-$children * case. It is strictly not necessary and you can set newDsr directly. * * But, you have 2 options: [$ccs, $ccs] or [$cce, $cce]. Setting it to * [$cce, $cce] would be consistent with the RTL approach. 
We should * then compare $ccs and $cce and verify that they are identical. * * But, if we handled the zero-child case like the other scenarios, * we don't have to worry about the above decisions and checks. * ----------------------------------------------------------------- */ if ( WTUtils::isDOMFragmentWrapper( $child ) || DOMUtils::hasTypeOf( $child, 'mw:LanguageVariant' ) ) { // Eliminate artificial $cs/s mismatch warnings since this is // just a wrapper token with the right DSR but without any // nested subtree that could account for the DSR span. $newDsr = [ $ccs, $cce ]; } elseif ( $child instanceof Element && WTUtils::isATagFromWikiLinkSyntax( $child ) && ( !isset( $dp->stx ) || $dp->stx !== "piped" ) ) { /* ------------------------------------------------------------- * This check here eliminates artificial DSR mismatches on content * text of the A-node because of entity expansion, etc. * * Ex: [[7%25 solution]] will be rendered as: * <a href=....>7% solution</a> * If we descend into the text for the a-node, we'll have a 2-char * DSR mismatch which will trigger artificial error warnings. * * In the non-piped link scenario, all dsr info is already present * in the link target and so we get nothing new by processing * content. * ------------------------------------------------------------- */ $newDsr = [ $ccs, $cce ]; } else { $env->log( "trace/dsr", static function () use ( $env, $cs, $ce, $stWidth, $etWidth, $ccs, $cce ) { return " before-recursing:" . "[cs,ce]=" . PHPUtils::jsonEncode( [ $cs, $ce ] ) . "; [sw,ew]=" . PHPUtils::jsonEncode( [ $stWidth, $etWidth ] ) . "; subtree-[cs,ce]=" . 
PHPUtils::jsonEncode( [ $ccs, $cce ] ); } ); $this->trace( $env, "<recursion>" ); $newDsr = $this->computeNodeDSR( $frame, $child, $ccs, $cce, $dsrCorrection, $opts ); $this->trace( $env, "</recursion>" ); } // $cs = min($child-dom-tree dsr->start - tag-width, current dsr->start) if ( $stWidth !== null && $newDsr[0] !== null ) { $newCs = $newDsr[0] - $stWidth; if ( $cs === null || ( !$tsr && $newCs < $cs ) ) { $cs = $newCs; } } // $ce = max($child-dom-tree dsr->end + tag-width, current dsr->end) if ( $etWidth !== null && $newDsr[1] !== null ) { $newCe = $newDsr[1] + $etWidth; if ( $newCe > $ce ) { $ce = $newCe; } } } if ( $cs !== null || $ce !== null ) { if ( $ce < 0 ) { if ( !$fosteredNode ) { $env->log( "info/dsr/negative", "Negative DSR for node: " . DOMCompat::nodeName( $node ) . "; resetting to zero" ); } $ce = 0; } // Fostered $nodes get a zero-dsr width range. if ( $fosteredNode ) { // Reset to 0, if necessary. // This is critical to avoid duplication of fostered content in selser mode. if ( $origCE < 0 ) { $origCE = 0; } $dp->dsr = new DomSourceRange( $origCE, $origCE, null, null ); } else { $dp->dsr = new DomSourceRange( $cs, $ce, $stWidth, $etWidth ); } $env->log( "trace/dsr", static function () use ( $frame, $child, $cs, $ce, $dp ) { return " UPDATING " . DOMCompat::nodeName( $child ) . " with " . PHPUtils::jsonEncode( [ $cs, $ce ] ) . "; typeof: " . ( DOMCompat::getAttribute( $child, "typeof" ) ?? 
'' ); } ); } // Propagate any required changes to the right // taking care not to cross-over into template content if ( $ce !== null && ( $propagateRight || $oldCE !== $ce || $e === null ) && !WTUtils::isTplStartMarkerMeta( $child ) ) { $sibling = $child->nextSibling; $newCE = $ce; while ( $newCE !== null && $sibling && !WTUtils::isTplStartMarkerMeta( $sibling ) ) { $nType = $sibling->nodeType; if ( $nType === XML_TEXT_NODE ) { $newCE += strlen( $sibling->textContent ); } elseif ( $nType === XML_COMMENT_NODE ) { '@phan-var Comment $sibling'; // @var Comment $sibling $newCE += WTUtils::decodedCommentLength( $sibling ); } elseif ( $nType === XML_ELEMENT_NODE ) { DOMUtils::assertElt( $sibling ); $siblingDP = DOMDataUtils::getDataParsoid( $sibling ); $siblingDP->dsr ??= new DomSourceRange( null, null, null, null ); $sdsrStart = $siblingDP->dsr->start; if ( !empty( $siblingDP->fostered ) || ( $sdsrStart !== null && $sdsrStart === $newCE ) || ( $sdsrStart !== null && $sdsrStart < $newCE && isset( $siblingDP->tsr ) ) ) { // $sibling is fostered // => nothing to propagate past it // $sibling's dsr->start matches what we might propagate // => nothing will change // $sibling's dsr value came from tsr and it is not outside expected range // => stop propagation so you don't overwrite it break; } // Update and move right $env->log( "trace/dsr", static function () use ( $frame, $newCE, $sibling, $siblingDP ) { return " CHANGING ce.start of " . DOMCompat::nodeName( $sibling ) . " from " . $siblingDP->dsr->start . " to " . $newCE; } ); $siblingDP->dsr->start = $newCE; // If we have a dsr->end as well and since we updated // dsr->start, we have to ensure that the two values don't // introduce an inconsistency where dsr->start > dsr->end. // Since we are in a LTR pass and are pushing updates // forward, we are resolving it by updating dsr->end as // well. There could be scenarios where this would be // incorrect, but there is no universal fix here. 
if ( $siblingDP->dsr->end !== null && $newCE > $siblingDP->dsr->end ) { $siblingDP->dsr->end = $newCE; } $newCE = $siblingDP->dsr->end; } else { break; } $sibling = $sibling->nextSibling; } // Propagate new end information if ( !$sibling ) { $e = $newCE; } } } // Don't change state if we processed a fostered $node if ( $fosteredNode ) { $ce = $origCE; } else { // $ce for next $child = $cs of current $child $ce = $cs; } $child = $prevChild; } if ( $cs === null ) { $cs = $s; } // Detect errors if ( $s !== null && $cs !== $s && !$this->acceptableInconsistency( $opts, $node, $cs, $s ) ) { $env->log( "info/dsr/inconsistent", "DSR inconsistency: cs/s mismatch for node:", DOMCompat::nodeName( $node ), "s:", $s, "; cs:", $cs ); } $this->trace( $env, "END: ", DOMCompat::nodeName( $node ), ", returning: ", $cs, ", ", $e ); return [ $cs, $e ]; } /** * Computes DSR ranges for every node of a DOM tree. * This pass is only invoked on the top-level page. * * @param Env $env The environment/context for the parse pipeline * @param Node $root The root of the tree for which DSR has to be computed * @param array $options Options governing DSR computation * - sourceOffsets: [start, end] source offset. If missing, this defaults to * [0, strlen($frame->getSrcText())] * - attrExpansion: Is this an attribute expansion pipeline? * @param bool $atTopLevel Are we running this on the top level? */ public function run( Env $env, Node $root, array $options = [], bool $atTopLevel = false ): void { // Don't run this in template content if ( $options['inTemplate'] ) { return; } $frame = $options['frame'] ?? $env->topFrame; $startOffset = $options['sourceOffsets']->start ?? 0; $endOffset = $options['sourceOffsets']->end ?? strlen( $frame->getSrcText() ); $env->log( "trace/dsr", "------- tracing DSR computation -------" ); // The actual computation buried in trace/debug stmts. $opts = [ 'attrExpansion' => $options['attrExpansion'] ?? 
false ]; $this->computeNodeDSR( $frame, $root, $startOffset, $endOffset, 0, $opts ); if ( $root instanceof Element ) { $dp = DOMDataUtils::getDataParsoid( $root ); $dp->dsr = new DomSourceRange( $startOffset, $endOffset, 0, 0 ); } $env->log( "trace/dsr", "------- done tracing computation -------" ); } } PK ! ��kB� � CompoundTemplateInfo.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Parsoid\Core\DomSourceRange; use Wikimedia\Parsoid\NodeData\TemplateInfo; class CompoundTemplateInfo { /** @var DomSourceRange */ public $dsr; /** @var TemplateInfo */ public $info; /** @var bool */ public $isParam; public function __construct( DomSourceRange $dsr, TemplateInfo $info, bool $isParam ) { $this->dsr = $dsr; $this->info = $info; $this->isParam = $isParam; } } PK ! iЦ� MigrateTrailingNLs.phpnu �Iw�� <?php declare( strict_types = 1 ); namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; use Wikimedia\Parsoid\Config\Env; use Wikimedia\Parsoid\DOM\Comment; use Wikimedia\Parsoid\DOM\DocumentFragment; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\DOM\Text; use Wikimedia\Parsoid\NodeData\DataParsoid; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMDataUtils; use Wikimedia\Parsoid\Utils\DOMUtils; use Wikimedia\Parsoid\Utils\PHPUtils; use Wikimedia\Parsoid\Utils\WTUtils; use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; class MigrateTrailingNLs implements Wt2HtmlDOMProcessor { private static $nodesToMigrateFrom; private function nodeEndsLineInWT( Node $node, DataParsoid $dp ): bool { // These nodes either end a line in wikitext (tr, li, dd, ol, ul, dl, caption, // p) or have implicit closing tags that can leak newlines to those that end a // line (th, td) // // SSS FIXME: Given condition 2, we may not need to check th/td anymore // (if we can rely on auto inserted start/end tags being present always). 
self::$nodesToMigrateFrom ??= PHPUtils::makeSet( [ 'pre', 'th', 'td', 'tr', 'li', 'dd', 'ol', 'ul', 'dl', 'caption', 'p' ] ); return isset( self::$nodesToMigrateFrom[DOMCompat::nodeName( $node )] ) && !WTUtils::hasLiteralHTMLMarker( $dp ); } private function getTableParent( Node $node ): ?Node { $nodeName = DOMCompat::nodeName( $node ); if ( in_array( $nodeName, [ 'td', 'th' ], true ) ) { $node = $node->parentNode; $nodeName = DOMCompat::nodeName( $node ); } if ( $nodeName === 'tr' ) { $node = $node->parentNode; $nodeName = DOMCompat::nodeName( $node ); } if ( in_array( $nodeName, [ 'tbody', 'thead', 'tfoot', 'caption' ], true ) ) { $node = $node->parentNode; $nodeName = DOMCompat::nodeName( $node ); } return ( $nodeName === 'table' ) ? $node : null; } /** * We can migrate a newline out of a node if one of the following is true: * (1) The node ends a line in wikitext (=> not a literal html tag) * (2) The node has an auto-closed end-tag (wikitext-generated or literal html tag) * and hasn't been fostered out of a table. * (3) It is the rightmost node in the DOM subtree rooted at a node * that ends a line in wikitext * @param Node $node * @return bool */ private function canMigrateNLOutOfNode( Node $node ): bool { if ( DOMCompat::nodeName( $node ) === 'table' || DOMUtils::atTheTop( $node ) ) { return false; } // Don't allow migration out of a table if the table has had // content fostered out of it. 
$tableParent = $this->getTableParent( $node ); if ( $tableParent && $tableParent->previousSibling instanceof Element ) { $previousSibling = $tableParent->previousSibling; '@phan-var Element $previousSibling'; // @var Element $previousSibling if ( !empty( DOMDataUtils::getDataParsoid( $previousSibling )->fostered ) ) { return false; } } DOMUtils::assertElt( $node ); $dp = DOMDataUtils::getDataParsoid( $node ); return empty( $dp->fostered ) && ( $this->nodeEndsLineInWT( $node, $dp ) || !empty( $dp->autoInsertedEnd ) || ( !$node->nextSibling && // FIXME: bug compatibility, previously the end meta caused // $node->nextSibling to be true for elements with end tags empty( $dp->tmp->endTSR ) && $node->parentNode && $this->canMigrateNLOutOfNode( $node->parentNode ) ) ); } /** * A node has zero wt width if: * - tsr->start == tsr->end * - only has children with zero wt width * @param Element $node * @return bool */ private function hasZeroWidthWT( Element $node ): bool { $tsr = DOMDataUtils::getDataParsoid( $node )->tsr ?? null; if ( !$tsr || $tsr->start === null || $tsr->start !== $tsr->end ) { return false; } $c = $node->firstChild; while ( $c instanceof Element && $this->hasZeroWidthWT( $c ) ) { $c = $c->nextSibling; } return $c === null; } public function doMigrateTrailingNLs( Node $elt, Env $env ): void { if ( !( $elt instanceof Element ) && !( $elt instanceof DocumentFragment ) ) { return; } // 1. Process DOM rooted at 'elt' first // // Process children backward so that a table // is processed before its fostered content. // See subtle changes in newline migration with this wikitext: // "<table>\n<tr> || ||\n<td> a\n</table>" // when walking backward vs. forward. // // Separately, walking backward also lets us ignore // newly added children after child (because of // migrated newline nodes from child's DOM tree). $child = $elt->lastChild; while ( $child !== null ) { $this->doMigrateTrailingNLs( $child, $env ); $child = $child->previousSibling; } // 2. 
Process 'elt' itself after -- skip literal-HTML nodes if ( $this->canMigrateNLOutOfNode( $elt ) ) { $firstEltToMigrate = null; $migrationBarrier = null; $partialContent = false; $n = $elt->lastChild; // We can migrate trailing newlines across nodes that have zero-wikitext-width. while ( $n instanceof Element && $this->hasZeroWidthWT( $n ) ) { $migrationBarrier = $n; $n = $n->previousSibling; } $isTdTh = DOMCompat::nodeName( $elt ) === 'td' || DOMCompat::nodeName( $elt ) === 'th'; // Find nodes that need to be migrated out: // - a sequence of comment and newline nodes that is preceded by // a non-migratable node (text node with non-white-space content // or an element node). $foundNL = false; $tsrCorrection = 0; while ( $n instanceof Text || $n instanceof Comment ) { if ( $n instanceof Comment ) { if ( $isTdTh ) { break; } $firstEltToMigrate = $n; $tsrCorrection += WTUtils::decodedCommentLength( $n ); } else { if ( !$isTdTh && preg_match( '/^[ \t\r\n]*\n[ \t\r\n]*$/D', $n->nodeValue ) ) { $foundNL = true; $firstEltToMigrate = $n; $partialContent = false; // all whitespace is moved $tsrCorrection += strlen( $n->nodeValue ); } elseif ( str_ends_with( $n->nodeValue, "\n" ) ) { $foundNL = true; $firstEltToMigrate = $n; $partialContent = true; // only newlines moved preg_match( '/\n+$/D', $n->nodeValue, $matches ); $tsrCorrection += strlen( $matches[0] ?? '' ); break; } else { break; } } $n = $n->previousSibling; } if ( $firstEltToMigrate && $foundNL ) { $eltParent = $elt->parentNode; $insertPosition = $elt->nextSibling; $n = $firstEltToMigrate; while ( $n !== $migrationBarrier ) { $next = $n->nextSibling; if ( $partialContent ) { $nls = $n->nodeValue; $n->nodeValue = preg_replace( '/\n+$/D', '', $n->nodeValue, 1 ); $nls = substr( $nls, strlen( $n->nodeValue ) ); $n = $n->ownerDocument->createTextNode( $nls ); $partialContent = false; } $eltParent->insertBefore( $n, $insertPosition ); $n = $next; } // Adjust tsr of any nodes after migrationBarrier. 
// Ex: zero-width nodes that have valid tsr on them // By definition (zero-width), these are synthetic nodes added by Parsoid // that aren't present in the original wikitext. $n = $migrationBarrier; while ( $n ) { // TSR is guaranteed to exist and be valid // (checked by hasZeroWidthWT above) DOMUtils::assertElt( $n ); $dp = DOMDataUtils::getDataParsoid( $n ); $dp->tsr = $dp->tsr->offset( -$tsrCorrection ); $n = $n->nextSibling; } } } } /** * @inheritDoc */ public function run( Env $env, Node $root, array $options = [], bool $atTopLevel = false ): void { $this->doMigrateTrailingNLs( $root, $env ); } } PK ! ��4LO� O� DOMRangeBuilder.phpnu �Iw�� PK ! ��$ $ �� PWrap.phpnu �Iw�� PK ! �,� � �� WrapSections.phpnu �Iw�� PK ! �0��� � �� DOMRangeInfo.phpnu �Iw�� PK ! &�_�� � �� ProcessEmbeddedDocs.phpnu �Iw�� PK ! �cjE]� ]� �� Linter.phpnu �Iw�� PK ! �Çcn$ n$ �� AddMetaData.phpnu �Iw�� PK ! �F=� � 8� WrapAnnotations.phpnu �Iw�� PK ! W�%k k v� MigrateTemplateMarkerMetas.phpnu �Iw�� PK ! ��� � /� ProcessTreeBuilderFixups.phpnu �Iw�� PK ! >��B�f �f ?� AddMediaInfo.phpnu �Iw�� PK ! ��v� � +\ UpdateTemplateOutput.phpnu �Iw�� PK ! �f��#5 #5 oh AnnotationDOMRangeBuilder.phpnu �Iw�� PK ! Re4[~ [~ ߝ WrapSectionsState.phpnu �Iw�� PK ! ����� � PWrapState.phpnu �Iw�� PK ! �ǐ�: : �% WrapSectionsTplInfo.phpnu �Iw�� PK ! �2�nx+ x+ ( MarkFosteredContent.phpnu �Iw�� PK ! zƇss s �S Normalize.phpnu �Iw�� PK ! �%v�� � xV ConvertOffsets.phpnu �Iw�� PK ! ��t =\ LangConverter.phpnu �Iw�� PK ! H�Dg� � �_ RunExtensionProcessors.phpnu �Iw�� PK ! ��$ $ gq WrapTemplates.phpnu �Iw�� PK ! �ip�2 2 �t Section.phpnu �Iw�� PK ! �xѧ � 9{ DOMRangeInfoArray.phpnu �Iw�� PK ! ��C� � %| AddRedLinks.phpnu �Iw�� PK ! ��_pE E � RangeBuilderException.phpnu �Iw�� PK ! �ܶ d d �� ComputeDSR.phpnu �Iw�� PK ! ��kB� � � CompoundTemplateInfo.phpnu �Iw�� PK ! iЦ� MigrateTrailingNLs.phpnu �Iw�� PK n b
| ver. 1.1 | |
.
| PHP 8.4.18 | Ð“ÐµÐ½ÐµÑ€Ð°Ñ†Ð¸Ñ Ñтраницы: 0.07 |
proxy
|
phpinfo
|
ÐаÑтройка