Hi, I was very kindly written the script below by Mindzai. I am now trying to write a function that does the opposite, so far I have not been too successful. The function takes a string of text, it then converts all entities in the text to there special html codes whilst leaving out the tags in the text. I want to achieve the opposite of this. My attempt is not great, it converts all entities until it comes to a tag, all the entities after that are then left.
This is the original function:
[code=php]function clean_entities($string) {
// Map of windows 1252 chracter points to utf-8 character points
$cp1252_map = array(
“xc2x80” => “xe2x82xac”, /* EURO SIGN */
“xc2x82” => “xe2x80x9a”, /* SINGLE LOW-9 QUOTATION MARK */
“xc2x83” => “xc6x92”, /* LATIN SMALL LETTER F WITH HOOK */
“xc2x84” => “xe2x80x9e”, /* DOUBLE LOW-9 QUOTATION MARK */
“xc2x85” => “xe2x80xa6”, /* HORIZONTAL ELLIPSIS */
“xc2x86” => “xe2x80xa0”, /* DAGGER */
“xc2x87” => “xe2x80xa1”, /* DOUBLE DAGGER */
“xc2x88” => “xcbx86”, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
“xc2x89” => “xe2x80xb0”, /* PER MILLE SIGN */
“xc2x8a” => “xc5xa0”, /* LATIN CAPITAL LETTER S WITH CARON */
“xc2x8b” => “xe2x80xb9”, /* SINGLE LEFT-POINTING ANGLE QUOTATION */
“xc2x8c” => “xc5x92”, /* LATIN CAPITAL LIGATURE OE */
“xc2x8e” => “xc5xbd”, /* LATIN CAPITAL LETTER Z WITH CARON */
“xc2x91” => “xe2x80x98”, /* LEFT SINGLE QUOTATION MARK */
“xc2x92” => “xe2x80x99”, /* RIGHT SINGLE QUOTATION MARK */
“xc2x93” => “xe2x80x9c”, /* LEFT DOUBLE QUOTATION MARK */
“xc2x94” => “xe2x80x9d”, /* RIGHT DOUBLE QUOTATION MARK */
“xc2x95” => “xe2x80xa2”, /* BULLET */
“xc2x96” => “xe2x80x93”, /* EN DASH */
“xc2x97” => “xe2x80x94”, /* EM DASH */
“xc2x98” => “xcbx9c”, /* SMALL TILDE */
“xc2x99” => “xe2x84xa2”, /* TRADE MARK SIGN */
“xc2x9a” => “xc5xa1”, /* LATIN SMALL LETTER S WITH CARON */
“xc2x9b” => “xe2x80xba”, /* SINGLE RIGHT-POINTING ANGLE QUOTATION*/
“xc2x9c” => “xc5x93”, /* LATIN SMALL LIGATURE OE */
“xc2x9e” => “xc5xbe”, /* LATIN SMALL LETTER Z WITH CARON */
“xc2x9f” => “xc5xb8” /* LATIN CAPITAL LETTER Y WITH DIAERESIS*/
);
// Map of utf-8 chracter points to special html entities
$ent_map = array(
“xe2x80x98” => ‘‘’,
“xe2x80x99” => ‘’’,
“xe2x80x9c” => ‘“’,
“xe2x80x9d” => ‘”’,
“xe2x82xac” => ‘€’
);
$string = trim($string);
// apply the windows > utf8 map
$string = str_replace(array_keys($cp1252_map), $cp1252_map, $string);
// get rid of any existing html entities to avoid double encoding
$string = html_entity_decode($string, ENT_QUOTES, ‘UTF-8’);
// break out any PHP sections since they should not be touched
$parts = preg_split(‘/(<?.+??>)/us’, $string, -1, PREG_SPLIT_DELIM_CAPTURE);
// replace &, “, ‘, < and > with their entities, but only where they are not
// inside an html tag
$string = ”;
foreach ($parts as $part) {
if (false === mb_strpos(trim($part), ‘<?’)) {
$string .= preg_replace_callback(
‘/(?<=>)((?![<](?|/)*[a-z][^>]*[>]).)+/is’,
create_function(
‘$matches’,
‘return htmlentities($matches[0]);’
),
$part
);
} else {
$string .= $part;
}
}
// apply the utf-8 > entities map
$string = str_replace(array_keys($ent_map), $ent_map, $string);
// trim whitespace from the end of each line and add a nice n
// tinymce in particular seems to have a bug where it will insert spaces
// at the end of lines.
$parts = preg_split(“/[rn]+/u”, $string);
foreach ($parts as &$part) {
$part = rtrim($part);
}
$string = implode(“n”, $parts);
return $string;
}
This is my attempt to reverse it:
[code=php]function remove_entities($string) {
// Map of windows 1252 chracter points to utf-8 character points
$cp1252_map = array(
“xe2x82xac” => “xc2x80”, /* EURO SIGN */
“xe2x80x9a” => “xc2x82”, /* SINGLE LOW-9 QUOTATION MARK */
“xc6x92” => “xc2x83”, /* LATIN SMALL LETTER F WITH HOOK */
“xe2x80x9e” => “xc2x84”, /* DOUBLE LOW-9 QUOTATION MARK */
“xe2x80xa6” => “xc2x85”, /* HORIZONTAL ELLIPSIS */
“xe2x80xa0” => “xc2x86”, /* DAGGER */
“xe2x80xa1” => “xc2x87”, /* DOUBLE DAGGER */
“xcbx86” => “xc2x88”, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
“xe2x80xb0” => “xc2x89”, /* PER MILLE SIGN */
“xc5xa0” => “xc2x8a”, /* LATIN CAPITAL LETTER S WITH CARON */
“xe2x80xb9” => “xc2x8b”, /* SINGLE LEFT-POINTING ANGLE QUOTATION */
“xc5x92” => “xc2x8c”, /* LATIN CAPITAL LIGATURE OE */
“xc5xbd” => “xc2x8e”, /* LATIN CAPITAL LETTER Z WITH CARON */
“xe2x80x98” => “xc2x91”, /* LEFT SINGLE QUOTATION MARK */
“xe2x80x99” => “xc2x92”, /* RIGHT SINGLE QUOTATION MARK */
“xe2x80x9c” => “xc2x93”, /* LEFT DOUBLE QUOTATION MARK */
“xe2x80x9d” => “xc2x94”, /* RIGHT DOUBLE QUOTATION MARK */
“xe2x80xa2” => “xc2x95”, /* BULLET */
“xe2x80x93” => “xc2x96”, /* EN DASH */
“xe2x80x94” => “xc2x97”, /* EM DASH */
“xcbx9c” => “xc2x98”, /* SMALL TILDE */
“xe2x84xa2” => “xc2x99”, /* TRADE MARK SIGN */
“xc5xa1” => “xc2x9a”, /* LATIN SMALL LETTER S WITH CARON */
“xe2x80xba” => “xc2x9b”, /* SINGLE RIGHT-POINTING ANGLE QUOTATION*/
“xc5x93” => “xc2x9c”, /* LATIN SMALL LIGATURE OE */
“xc5xbe” => “xc2x9e”, /* LATIN SMALL LETTER Z WITH CARON */
“xc5xb8” => “xc2x9f” /* LATIN CAPITAL LETTER Y WITH DIAERESIS*/
);
// Map of utf-8 chracter points to special html entities
$ent_map = array(
‘‘’ => “xe2x80x98”,
‘’’ => “xe2x80x99”,
‘“’ => “xe2x80x9c”,
‘”’ => “xe2x80x9d”,
‘€’ => “xe2x82xac”
);
$string = trim($string);
// apply the windows > utf8 map
$string = str_replace(array_keys($cp1252_map), $cp1252_map, $string);
// get rid of any existing html entities to avoid double encoding
$string = htmlentities($string, ENT_QUOTES, ‘UTF-8’);
// break out any PHP sections since they should not be touched
$parts = preg_split(‘/(<?.+??>)/us’, $string, -1, PREG_SPLIT_DELIM_CAPTURE);
// replace &, “, ‘, < and > with their entities, but only where they are not
// inside an html tag
$string = ”;
foreach ($parts as $part) {
if (false === mb_strpos(trim($part), ‘<?’)) {
$string .= preg_replace_callback(
‘/(?<=>)((?![<](?|/)*[a-z][^>]*[>]).)+/is’,
create_function(
‘$matches’,
‘return html_entity_decode($matches[0]);’
),
$part
);
} else {
$string .= $part;
}
}
// apply the utf-8 > entities map
$string = str_replace(array_keys($ent_map), $ent_map, $string);
return $string;
}