html.php
4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
<?php
/**
* @package Joomla.Administrator
* @subpackage com_finder
*
* @copyright Copyright (C) 2005 - 2019 Open Source Matters, Inc. All rights reserved.
* @license GNU General Public License version 2 or later; see LICENSE.txt
*/
defined('_JEXEC') or die;
JLoader::register('FinderIndexerParser', dirname(__DIR__) . '/parser.php');
/**
* HTML Parser class for the Finder indexer package.
*
* @since 2.5
*/
class FinderIndexerParserHtml extends FinderIndexerParser
{
/**
* Method to parse input and extract the plain text. Because this method is
* called from both inside and outside the indexer, it needs to be able to
* batch out its parsing functionality to deal with the inefficiencies of
* regular expressions. We will parse recursively in 2KB chunks.
*
* @param string $input The input to parse.
*
* @return string The plain text input.
*
* @since 2.5
*/
public function parse($input)
{
// Strip invalid UTF-8 characters.
$input = iconv('utf-8', 'utf-8//IGNORE', $input);
// Remove anything between <head> and </head> tags. Do this first
// because there might be <script> or <style> tags nested inside.
$input = $this->removeBlocks($input, '<head>', '</head>');
// Convert <style> and <noscript> tags to <script> tags
// so we can remove them efficiently.
$search = array(
'<style', '</style',
'<noscript', '</noscript',
);
$replace = array(
'<script', '</script',
'<script', '</script',
);
$input = str_replace($search, $replace, $input);
// Strip all script blocks.
$input = $this->removeBlocks($input, '<script', '</script>');
// Decode HTML entities.
$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
// Convert entities equivalent to spaces to actual spaces.
$input = str_replace(array(' ', ' '), ' ', $input);
// Add a space before both the OPEN and CLOSE tags of BLOCK and LINE BREAKING elements,
// e.g. 'all<h1><em>m</em>obile List</h1>' will become 'all mobile List'
$input = preg_replace('/(<|<\/)(' .
'address|article|aside|blockquote|br|canvas|dd|div|dl|dt|' .
'fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|' .
'main|nav|noscript|ol|output|p|pre|section|table|tfoot|ul|video' .
')\b/i', ' $1$2', $input
);
// Strip HTML tags.
$input = strip_tags($input);
return parent::parse($input);
}
/**
* Method to process HTML input and extract the plain text.
*
* @param string $input The input to process.
*
* @return string The plain text input.
*
* @since 2.5
*/
protected function process($input)
{
// Replace any amount of white space with a single space.
return preg_replace('#\s+#u', ' ', $input);
}
/**
* Method to remove blocks of text between a start and an end tag.
* Each block removed is effectively replaced by a single space.
*
* Note: The start tag and the end tag must be different.
* Note: Blocks must not be nested.
* Note: This method will function correctly with multi-byte strings.
*
* @param string $input String to be processed.
* @param string $startTag String representing the start tag.
* @param string $endTag String representing the end tag.
*
* @return string with blocks removed.
*
* @since 3.4
*/
private function removeBlocks($input, $startTag, $endTag)
{
$return = '';
$offset = 0;
$startTagLength = strlen($startTag);
$endTagLength = strlen($endTag);
// Find the first start tag.
$start = stripos($input, $startTag);
// If no start tags were found, return the string unchanged.
if ($start === false)
{
return $input;
}
// Look for all blocks defined by the start and end tags.
while ($start !== false)
{
// Accumulate the substring up to the start tag.
$return .= substr($input, $offset, $start - $offset) . ' ';
// Look for an end tag corresponding to the start tag.
$end = stripos($input, $endTag, $start + $startTagLength);
// If no corresponding end tag, leave the string alone.
if ($end === false)
{
// Fix the offset so part of the string is not duplicated.
$offset = $start;
break;
}
// Advance the start position.
$offset = $end + $endTagLength;
// Look for the next start tag and loop.
$start = stripos($input, $startTag, $offset);
}
// Add in the final substring after the last end tag.
$return .= substr($input, $offset);
return $return;
}
}