parser.php
2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
<?php
/**
* @package Joomla.Administrator
* @subpackage com_finder
*
* @copyright Copyright (C) 2005 - 2019 Open Source Matters, Inc. All rights reserved.
* @license GNU General Public License version 2 or later; see LICENSE.txt
*/
defined('_JEXEC') or die;
/**
* Parser base class for the Finder indexer package.
*
* @since 2.5
*/
abstract class FinderIndexerParser
{
	/**
	 * Method to get a parser, creating it if necessary.
	 *
	 * The requested format is sanitised with the 'cmd' input filter before it is
	 * used as the cache key, as the file name below ./parser/ and as the suffix
	 * of the parser class name. One instance is kept per sanitised format for the
	 * lifetime of the request.
	 *
	 * @param   string  $format  The type of parser to load.
	 *
	 * @return  FinderIndexerParser  A FinderIndexerParser instance.
	 *
	 * @since   2.5
	 * @throws  Exception on invalid parser.
	 */
	public static function getInstance($format)
	{
		static $instances = array();

		// Sanitise first so the cache is looked up with the same key it is stored under.
		$format = JFilterInput::getInstance()->clean($format, 'cmd');

		// Only create one parser for each format.
		if (isset($instances[$format]))
		{
			return $instances[$format];
		}

		// Setup the adapter for the parser.
		$path  = __DIR__ . '/parser/' . $format . '.php';
		$class = 'FinderIndexerParser' . ucfirst($format);

		// Check if a parser exists for the format.
		if (!file_exists($path))
		{
			// Throw invalid format exception.
			throw new Exception(JText::sprintf('COM_FINDER_INDEXER_INVALID_PARSER', $format));
		}

		// Make the parser class loadable.
		JLoader::register($class, $path);

		// A file that does not define the expected class is an invalid parser too;
		// report it with the documented exception instead of a fatal error.
		if (!class_exists($class))
		{
			throw new Exception(JText::sprintf('COM_FINDER_INDEXER_INVALID_PARSER', $format));
		}

		// Instantiate the parser.
		$instances[$format] = new $class;

		return $instances[$format];
	}

	/**
	 * Method to parse input and extract the plain text. Because this method is
	 * called from both inside and outside the indexer, it needs to be able to
	 * batch out its parsing functionality to deal with the inefficiencies of
	 * regular expressions. Input longer than 2KB is processed iteratively in
	 * chunks of 2KB or less.
	 *
	 * Chunks are cut at the last space they contain. That space is not passed to
	 * process(); a single space is appended to the processed chunk instead so
	 * that the words on either side of the cut stay separated. When a chunk
	 * contains no space at all, the cut is moved back by up to three bytes so
	 * that it never lands in the middle of a multi-byte UTF-8 character.
	 *
	 * @param   string  $input  The input to parse.
	 *
	 * @return  string  The plain text input.
	 *
	 * @since   2.5
	 */
	public function parse($input)
	{
		$chunk = 2048;
		$end   = strlen($input);

		// If the input is 2KB or less we can parse it in one go.
		if ($end <= $chunk)
		{
			return $this->process($input);
		}

		// Input is longer than 2KB so parse it in chunks of 2KB or less.
		$start  = 0;
		$return = '';

		while ($start < $end)
		{
			// Setup the string.
			$string = substr($input, $start, $chunk);

			// The final chunk is processed as it is.
			if (($start + $chunk) >= $end)
			{
				$return .= $this->process($string);

				break;
			}

			// Find the last space character in the chunk.
			$ls = strrpos($string, ' ');

			if ($ls !== false)
			{
				// Cut at the space and restore the word boundary in the output.
				$return .= $this->process(substr($string, 0, $ls)) . ' ';

				// Resume right after the space that was consumed.
				$start += $ls + 1;

				continue;
			}

			// No space to cut at. If the first byte after the cut is a UTF-8
			// continuation byte (10xxxxxx) the cut is inside a character, so back
			// off. A UTF-8 character has at most three continuation bytes, which
			// also bounds the loop for input that is not valid UTF-8.
			$length = $chunk;

			for ($i = 0; $i < 3 && (ord($input[$start + $length]) & 0xC0) === 0x80; $i++)
			{
				$length--;
			}

			$return .= $this->process(substr($string, 0, $length));
			$start  += $length;
		}

		return $return;
	}

	/**
	 * Method to process input and extract the plain text.
	 *
	 * Called by parse() once per chunk, so implementations must be able to
	 * handle a fragment of the full input, including an empty string.
	 *
	 * @param   string  $input  The input to process.
	 *
	 * @return  string  The plain text input.
	 *
	 * @since   2.5
	 */
	abstract protected function process($input);
}