Hi,
I have a search function that searched a DB and returns the data, that works great. BUT, In the DB i store the info as layout, so all the data come back all formatted with html, p’s and div’s and even some tables in the older content.
I have searched and found this code posted by Bokeh and NogDog years ago but it doesn’t strip everything. I just want the plain text outside of <> tags.
Any Ideas
[url]http://www.asktom.co.nz/search
[code=php]
function replace_links($text)
# convert HTML links to textual representations
# “<a href=”http:/a.b.com/”>test</a>” -> “test (http:/a.b.com/)”
{
# define regexp components for main regexp:
$start = ‘<as[^>]*href=’; # start of A link
$mail_q = ‘[‘”]mailto:([^'”]+)[‘”]’; # quoted mailto
$mail_u = ‘mailto:([^s>]+)’; # unquoted mailto
$link_q = ‘[‘”](h?[ft]tp:[^'”]+)[‘”]’; # quoted http or ftp link
$link_u = ‘(h?[ft]tp:[^s>]+)’; # unquoted http or ftp link
$end = ‘[^>]*>(.+)</a>’; # end of A link
$search = array(“/$start(?:$mail_q|$mail_u|$link_q|$link_u)$end/i”,
‘/<as[^>]*>(.*)</a>/i’); # local file or other non-match
$replace = array(‘5 (1234)’, ‘1’);
return(preg_replace($search, $replace, $text));
}
function CleanUp($input)
{
// list of allowed tags. Edit to taste
define(‘__HTML__’, ‘a|b|br|i|img|p|span’);
// list of allowed attributes. Edit to taste
define(‘__ATTRIBUTES__’, ‘src|alt|href|title|class’);
if(!function_exists(‘DisallowedTagsCallback’))
{
function DisallowedTagsCallback($input)
{
$input[0] = strip_tags($input[0]);
return htmlentities($input[0]);
}
}
if(!function_exists(‘DisallowedAttributesCallback’))
{
function DisallowedAttributesCallback($input)
{
$regex = ‘/s*b(?!(?:’.__ATTRIBUTES__.’))[a-z]+bs*[=]s*([‘”])’.
‘(((?!1).)|((?<=[\\])1))*1/is’;
return preg_replace($regex, ”, $input[0]);
}
}
// strip out any javascript ( <script>, onclick etc, and href=”javascript:” )
$regex = array(‘/<scriptb[^>]*>((?!</scriptb[^>]*>).)*</scriptb[^>]*>/is’,
‘/s*bon[a-z]+s*[=]s*([“‘])(((?!1)[^\])|((?<!\\)(?:\\\\)*\\1)|(?!(?:\\)*1)\\)*1/i’,
‘/href+s*[=]s*([“‘])((?!1).)*javascript((?!1).)*1/is’);
$replace = array(”, ”, ‘href=”#”‘);
$input = preg_replace($regex, $replace, $input);
// strip disallowed tags
$regex= ” @(((?<=^)|(?<=[>]))(?![<]/?(“.__HTML__.”)b[^>]*[>])”.
“([^<]|((?![<]/?(“.__HTML__.”)b[^>]*[>])[<]))+)@i”;
$input = preg_replace_callback($regex, ‘DisallowedTagsCallback’, $input);
// strip disallowed attributes
$regex = ‘/(?<=[<])[^>]+(?=[>])/’;
return preg_replace_callback($regex, ‘DisallowedAttributesCallback’, $input);
}
function bold($tag,$line){
$line = replace_links($line);
$line = CleanUp($line);
$line = htmlentities($line);
$line = substr(str_replace($tag, “<strong class=”highlight”>”.$tag.”</strong>”, $line), 30);
return $line;
}
Thanks for taking the time