Update v1.0.6

This commit is contained in:
Bhanu Slathia
2016-02-16 23:22:09 +05:30
parent 62d04a0372
commit c710c20b9e
7620 changed files with 244752 additions and 1070312 deletions

View File

@@ -1,61 +0,0 @@
forceutf8
=========
PHP Class Encoding featuring popular \ForceUTF8\Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings.
Description
===========
If you apply the PHP function utf8_encode() to an already-UTF8 string it will return a garbled UTF8 string.
This class addresses this issue and provides a handy static function called \ForceUTF8\Encoding::toUTF8().
You don't need to know what the encoding of your strings is. It can be Latin1 (iso 8859-1), Windows-1252 or UTF8, or the string can have a mix of them. \ForceUTF8\Encoding::toUTF8() will convert everything to UTF8.
Sometimes you have to deal with services that are unreliable in terms of encoding, possibly mixing UTF8 and Latin1 in the same string.
Update:
I've included another function, \ForceUTF8\Encoding::fixUTF8(), which will fix the double (or multiple) encoded UTF8 string that looks garbled.
Usage:
======
use \ForceUTF8\Encoding;
$utf8_string = Encoding::toUTF8($utf8_or_latin1_or_mixed_string);
$latin1_string = Encoding::toLatin1($utf8_or_latin1_or_mixed_string);
also:
$utf8_string = Encoding::fixUTF8($garbled_utf8_string);
Examples:
use \ForceUTF8\Encoding;
echo Encoding::fixUTF8("FÃ©dération Camerounaise de Football\n");
echo Encoding::fixUTF8("FédÃ©ration Camerounaise de Football\n");
echo Encoding::fixUTF8("FÃ©dÃ©ration Camerounaise de Football\n");
echo Encoding::fixUTF8("Fédération Camerounaise de Football\n");
will output:
Fédération Camerounaise de Football
Fédération Camerounaise de Football
Fédération Camerounaise de Football
Fédération Camerounaise de Football
Install via composer:
=====================
Edit your composer.json file to include the following:
```json
{
"require": {
"neitanod/forceutf8": "dev-master"
}
}
```

View File

@@ -1,20 +0,0 @@
{
"name": "neitanod/forceutf8",
"homepage": "https://github.com/neitanod/forceutf8",
"type": "library",
"description": "PHP Class Encoding featuring popular Encoding::toUTF8() function --formerly known as forceUTF8()-- that fixes mixed encoded strings.",
"require": {
"php": ">=5.3.0"
},
"authors": [
{
"name": "Sebastián Grignoli",
"email": "grignoli@gmail.com"
}
],
"autoload": {
"psr-0": {
"ForceUTF8\\": "src/"
}
}
}

View File

@@ -1,347 +0,0 @@
<?php
/*
Copyright (c) 2008 Sebastián Grignoli
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of copyright holders nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @author "Sebastián Grignoli" <grignoli@gmail.com>
* @package Encoding
* @version 2.0
* @link https://github.com/neitanod/forceutf8
* @example https://github.com/neitanod/forceutf8
* @license Revised BSD
*/
namespace ForceUTF8;
/**
 * Converts strings of unknown or mixed encoding (ISO-8859-1, Windows-1252,
 * UTF-8, or any mix of them inside one string) to UTF-8 or Windows-1252, and
 * repairs UTF-8 text that was encoded more than once.
 *
 * All public methods are static. Array arguments are converted element by
 * element (recursively); values that are neither strings nor arrays are
 * returned unchanged.
 */
class Encoding
{
    const ICONV_TRANSLIT = "TRANSLIT";
    const ICONV_IGNORE = "IGNORE";
    const WITHOUT_ICONV = "";

    /**
     * Windows-1252 bytes 0x80-0x9F, keyed by byte value, mapped to the UTF-8
     * encoding of the character they represent. Bytes that Windows-1252
     * leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) are absent.
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",
        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",
        142 => "\xc5\xbd",
        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",
        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * UTF-8 encodings of the C1 control code points U+0080-U+009F (what a
     * Windows-1252 byte becomes when it is converted as if it were
     * ISO-8859-1), mapped to the UTF-8 encoding of the intended character.
     */
    protected static $brokenUtf8ToUtf8 = array(
        "\xc2\x80" => "\xe2\x82\xac",
        "\xc2\x82" => "\xe2\x80\x9a",
        "\xc2\x83" => "\xc6\x92",
        "\xc2\x84" => "\xe2\x80\x9e",
        "\xc2\x85" => "\xe2\x80\xa6",
        "\xc2\x86" => "\xe2\x80\xa0",
        "\xc2\x87" => "\xe2\x80\xa1",
        "\xc2\x88" => "\xcb\x86",
        "\xc2\x89" => "\xe2\x80\xb0",
        "\xc2\x8a" => "\xc5\xa0",
        "\xc2\x8b" => "\xe2\x80\xb9",
        "\xc2\x8c" => "\xc5\x92",
        "\xc2\x8e" => "\xc5\xbd",
        "\xc2\x91" => "\xe2\x80\x98",
        "\xc2\x92" => "\xe2\x80\x99",
        "\xc2\x93" => "\xe2\x80\x9c",
        "\xc2\x94" => "\xe2\x80\x9d",
        "\xc2\x95" => "\xe2\x80\xa2",
        "\xc2\x96" => "\xe2\x80\x93",
        "\xc2\x97" => "\xe2\x80\x94",
        "\xc2\x98" => "\xcb\x9c",
        "\xc2\x99" => "\xe2\x84\xa2",
        "\xc2\x9a" => "\xc5\xa1",
        "\xc2\x9b" => "\xe2\x80\xba",
        "\xc2\x9c" => "\xc5\x93",
        "\xc2\x9e" => "\xc5\xbe",
        "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * UTF-8 encodings of the characters Windows-1252 places at 0x80-0x9F,
     * mapped to their single Windows-1252 byte (inverse of $win1252ToUtf8).
     */
    protected static $utf8ToWin1252 = array(
        "\xe2\x82\xac" => "\x80",
        "\xe2\x80\x9a" => "\x82",
        "\xc6\x92" => "\x83",
        "\xe2\x80\x9e" => "\x84",
        "\xe2\x80\xa6" => "\x85",
        "\xe2\x80\xa0" => "\x86",
        "\xe2\x80\xa1" => "\x87",
        "\xcb\x86" => "\x88",
        "\xe2\x80\xb0" => "\x89",
        "\xc5\xa0" => "\x8a",
        "\xe2\x80\xb9" => "\x8b",
        "\xc5\x92" => "\x8c",
        "\xc5\xbd" => "\x8e",
        "\xe2\x80\x98" => "\x91",
        "\xe2\x80\x99" => "\x92",
        "\xe2\x80\x9c" => "\x93",
        "\xe2\x80\x9d" => "\x94",
        "\xe2\x80\xa2" => "\x95",
        "\xe2\x80\x93" => "\x96",
        "\xe2\x80\x94" => "\x97",
        "\xcb\x9c" => "\x98",
        "\xe2\x84\xa2" => "\x99",
        "\xc5\xa1" => "\x9a",
        "\xe2\x80\xba" => "\x9b",
        "\xc5\x93" => "\x9c",
        "\xc5\xbe" => "\x9e",
        "\xc5\xb8" => "\x9f"
    );

    /**
     * Function \ForceUTF8\Encoding::toUTF8
     *
     * Leaves well-formed UTF-8 sequences alone and converts every other
     * non-ASCII byte to UTF-8, assuming that byte is Windows-1252 or
     * ISO-8859-1.
     *
     * It may fail to convert characters if they fall into one of these
     * scenarios, because the raw bytes already look like valid UTF-8:
     *
     * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these ("group B"):
     *    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿
     *    For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
     *    The "«" (%AB) character will be converted, but the "É" followed by
     *    "»" (%C9%BB) is also a valid UTF-8 sequence and is left unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
     * 3) when any of these: ðñòó are followed by THREE chars from group B.
     *
     * @name toUTF8
     * @param mixed $text A string, or an array of values to convert recursively.
     * @return mixed The same string UTF-8 encoded; arrays are returned with
     *               every element converted; other types are returned as is.
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }
            return $text;
        }
        if (!is_string($text)) {
            return $text;
        }

        $max = self::strlen($text);
        $buf = "";
        for ($i = 0; $i < $max; $i++) {
            // Square-bracket offsets: the former $text{$i} form is a parse error on PHP 8.
            $c1 = $text[$i];
            $o1 = ord($c1);

            if ($o1 < 0x80) {
                // Plain ASCII never needs conversion.
                $buf .= $c1;
            } elseif ($o1 < 0xc0) {
                // 0x80-0xBF outside a sequence: Windows-1252 special case, else Latin-1.
                $buf .= isset(self::$win1252ToUtf8[$o1])
                    ? self::$win1252ToUtf8[$o1]
                    : self::latin1ByteToUtf8($c1);
            } else {
                $seq = self::utf8SequenceAt($text, $i, $max);
                if ($seq !== null) {
                    // Almost certainly UTF-8 already: copy the whole sequence untouched.
                    $buf .= $seq;
                    $i += self::strlen($seq) - 1;
                } else {
                    // Lead-looking byte without proper continuation bytes: treat as Latin-1.
                    $buf .= self::latin1ByteToUtf8($c1);
                }
            }
        }
        return $buf;
    }

    /**
     * Converts text of unknown/mixed encoding to Windows-1252.
     *
     * @param mixed  $text   A string, or an array of values to convert recursively.
     * @param string $option One of WITHOUT_ICONV, ICONV_TRANSLIT, ICONV_IGNORE.
     * @return mixed Converted string/array; other types are returned as is.
     */
    public static function toWin1252($text, $option = self::WITHOUT_ICONV)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toWin1252($v, $option);
            }
            return $text;
        } elseif (is_string($text)) {
            return static::utf8_decode($text, $option);
        } else {
            return $text;
        }
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed  $text   See toWin1252().
     * @param string $option See toWin1252(); previously this alias silently ignored it.
     * @return mixed
     */
    public static function toISO8859($text, $option = self::WITHOUT_ICONV)
    {
        return self::toWin1252($text, $option);
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed  $text   See toWin1252().
     * @param string $option See toWin1252(); previously this alias silently ignored it.
     * @return mixed
     */
    public static function toLatin1($text, $option = self::WITHOUT_ICONV)
    {
        return self::toWin1252($text, $option);
    }

    /**
     * Repairs UTF-8 text that was encoded to UTF-8 more than once by decoding
     * and re-encoding it until the result stops changing.
     *
     * @param mixed  $text   A string, or an array of values to fix recursively.
     * @param string $option One of WITHOUT_ICONV, ICONV_TRANSLIT, ICONV_IGNORE.
     * @return mixed Fixed string/array. Values that are neither strings nor
     *               arrays are returned unchanged, matching toUTF8() and
     *               toWin1252() (they used to be converted to strings).
     */
    public static function fixUTF8($text, $option = self::WITHOUT_ICONV)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::fixUTF8($v, $option);
            }
            return $text;
        }
        if (!is_string($text)) {
            return $text;
        }

        // Strict comparison: a fixed point means one more pass changes nothing,
        // so no extra pass is needed after the loop.
        $last = "";
        while ($last !== $text) {
            $last = $text;
            $text = self::toUTF8(static::utf8_decode($text, $option));
        }
        return $text;
    }

    /**
     * If you received a UTF-8 string that was converted from Windows-1252 as
     * if it were ISO-8859-1 (ignoring the Windows-1252 characters from 0x80
     * to 0x9F), use this function to fix it.
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param string|array $text Subject passed straight to str_replace().
     * @return string|array
     */
    public static function UTF8FixWin1252Chars($text)
    {
        return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
    }

    /**
     * Strips a leading UTF-8 byte order mark (EF BB BF) if present.
     *
     * @param string $str
     * @return string
     */
    public static function removeBOM($str = "")
    {
        if (substr($str, 0, 3) === "\xef\xbb\xbf") {
            $str = substr($str, 3);
        }
        return $str;
    }

    /**
     * Byte length of $text, even when mbstring.func_overload replaces strlen().
     *
     * @param string $text
     * @return int
     */
    protected static function strlen($text)
    {
        return (function_exists('mb_strlen') && ((int) ini_get('mbstring.func_overload')) & 2) ?
            mb_strlen($text, '8bit') : strlen($text);
    }

    /**
     * Maps a loosely written encoding label to 'ISO-8859-1' or 'UTF-8'.
     * The label is upper-cased and stripped of everything except letters,
     * digits and whitespace before lookup; unknown labels yield 'UTF-8'.
     *
     * @param string $encodingLabel e.g. "latin-1", "utf8", "Windows-1252".
     * @return string 'ISO-8859-1' or 'UTF-8'.
     */
    public static function normalizeEncoding($encodingLabel)
    {
        $encoding = strtoupper($encodingLabel);
        $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
        $equivalences = array(
            'ISO88591' => 'ISO-8859-1',
            'ISO8859' => 'ISO-8859-1',
            'ISO' => 'ISO-8859-1',
            'LATIN1' => 'ISO-8859-1',
            'LATIN' => 'ISO-8859-1',
            'UTF8' => 'UTF-8',
            'UTF' => 'UTF-8',
            'WIN1252' => 'ISO-8859-1',
            'WINDOWS1252' => 'ISO-8859-1'
        );
        if (empty($equivalences[$encoding])) {
            return 'UTF-8';
        }
        return $equivalences[$encoding];
    }

    /**
     * Converts $text to the encoding named by $encodingLabel.
     *
     * @param string $encodingLabel Any label accepted by normalizeEncoding().
     * @param mixed  $text          String or array to convert.
     * @return mixed
     */
    public static function encode($encodingLabel, $text)
    {
        $encodingLabel = self::normalizeEncoding($encodingLabel);
        if ($encodingLabel === 'ISO-8859-1') {
            return self::toLatin1($text);
        }
        return self::toUTF8($text);
    }

    /**
     * Converts text to Windows-1252 bytes.
     *
     * Without iconv the text is first normalised with toUTF8() and then
     * decoded by utf8ToWin1252Bytes(). The builtin utf8_decode() is not used:
     * on PHP 7+ it turns the raw 0x80-0x9F bytes produced by the
     * Windows-1252 mapping into "?", and it is deprecated as of PHP 8.2.
     *
     * @param string $text
     * @param string $option One of WITHOUT_ICONV, ICONV_TRANSLIT, ICONV_IGNORE.
     * @return string|false iconv() result may be false on failure.
     */
    protected static function utf8_decode($text, $option = self::WITHOUT_ICONV)
    {
        // Loose comparison kept on purpose so null/false still select the non-iconv path.
        if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) {
            $o = self::utf8ToWin1252Bytes((string) self::toUTF8($text));
        } else {
            $o = iconv("UTF-8", "Windows-1252" . ($option == self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option == self::ICONV_IGNORE ? '//IGNORE' : '')), $text);
        }
        return $o;
    }

    /**
     * Encodes one byte >= 0x80, read as an ISO-8859-1 character, as the
     * equivalent two-byte UTF-8 sequence.
     *
     * @param string $byte Single byte.
     * @return string Two bytes.
     */
    protected static function latin1ByteToUtf8($byte)
    {
        $code = ord($byte);
        // Integer shift instead of the former ord()/64 float division.
        return chr(0xc0 | ($code >> 6)) . chr(0x80 | ($code & 0x3f));
    }

    /**
     * Returns the structurally well-formed UTF-8 multi-byte sequence that
     * starts at $offset (a 2/3/4-byte lead followed by the right number of
     * 10xxxxxx bytes), or null when there is none.
     *
     * @param string $text
     * @param int    $offset Byte offset of the candidate lead byte.
     * @param int    $max    Byte length of $text.
     * @return string|null
     */
    protected static function utf8SequenceAt($text, $offset, $max)
    {
        $lead = ord($text[$offset]);
        if ($lead >= 0xc0 && $lead <= 0xdf) {
            $needed = 1;
        } elseif ($lead >= 0xe0 && $lead <= 0xef) {
            $needed = 2;
        } elseif ($lead >= 0xf0 && $lead <= 0xf7) {
            $needed = 3;
        } else {
            return null;
        }
        if ($offset + $needed >= $max) {
            // Sequence would run past the end of the string.
            return null;
        }
        $seq = $text[$offset];
        for ($j = 1; $j <= $needed; $j++) {
            $next = $text[$offset + $j];
            if ((ord($next) & 0xc0) !== 0x80) {
                return null;
            }
            $seq .= $next;
        }
        return $seq;
    }

    /**
     * Decodes structurally valid UTF-8 to Windows-1252 bytes: characters with
     * a Windows-1252-specific byte use $utf8ToWin1252, U+0080-U+00FF map to
     * the byte of the same value, anything else becomes "?". Bytes that are
     * not part of a well-formed sequence are copied through unchanged.
     *
     * @param string $utf8
     * @return string
     */
    protected static function utf8ToWin1252Bytes($utf8)
    {
        $max = self::strlen($utf8);
        $out = "";
        for ($i = 0; $i < $max; $i++) {
            $c1 = $utf8[$i];
            $lead = ord($c1);
            $seq = $lead >= 0xc0 ? self::utf8SequenceAt($utf8, $i, $max) : null;
            if ($seq === null) {
                $out .= $c1;
                continue;
            }
            $i += self::strlen($seq) - 1;
            if (isset(self::$utf8ToWin1252[$seq])) {
                $out .= self::$utf8ToWin1252[$seq];
            } elseif ($lead === 0xc2 || $lead === 0xc3) {
                // Only leads C2/C3 encode U+0080-U+00FF.
                $out .= chr((($lead & 0x03) << 6) | (ord($seq[1]) & 0x3f));
            } else {
                $out .= "?";
            }
        }
        return $out;
    }
}

View File

@@ -1,101 +0,0 @@
<?php
require_once(dirname(__FILE__)."/Test.class.php");
require_once(dirname(dirname(__FILE__))."/src/ForceUTF8/Encoding.php");
use \ForceUTF8\Encoding;
// Sanity-check the Test helper before relying on it.
Test::is("'yes' is true", 'yes', true);
Test::not("1 is not false", 1, false);
Test::identical("true is identical to true", true, true);
Test::true("1 is true", 1);

// ForceUTF8: the two fixtures hold the same text in different encodings.
$utf8Fixture = file_get_contents(__DIR__."/data/test1.txt");
$latin1Fixture = file_get_contents(__DIR__."/data/test1Latin.txt");

Test::not(
    "Source files must not use the same encoding before conversion.",
    $utf8Fixture,
    $latin1Fixture
);
Test::identical(
    "Simple Encoding works.",
    $utf8Fixture,
    Encoding::toUTF8($latin1Fixture)
);
/**
 * Checks that an array mixing the Latin-1 and UTF-8 fixtures is not loosely
 * equal to an array holding the UTF-8 fixture three times.
 *
 * @return bool True when the two arrays differ.
 */
function test_arrays_are_different(){
    $utf8 = file_get_contents(__DIR__."/data/test1.txt");
    $latin1 = file_get_contents(__DIR__."/data/test1Latin.txt");

    $mixed = array($latin1, $utf8, $latin1);
    $allUtf8 = array($utf8, $utf8, $utf8);

    return !($mixed == $allUtf8);
}
/**
 * Checks that Encoding::toUTF8() applied to an array mixing the Latin-1 and
 * UTF-8 fixtures yields an array loosely equal to three UTF-8 fixtures.
 *
 * @return bool True when the converted array matches.
 */
function test_encoding_of_arrays(){
    $utf8 = file_get_contents(__DIR__."/data/test1.txt");
    $latin1 = file_get_contents(__DIR__."/data/test1Latin.txt");

    $converted = Encoding::toUTF8(array($latin1, $utf8, $latin1));
    $expected = array($utf8, $utf8, $utf8);

    return $converted == $expected;
}
// Array handling of toUTF8().
Test::true("Source arrays are different.", test_arrays_are_different());
Test::true("Encoding of array works.", test_encoding_of_arrays());

// fixUTF8() on correct and on double-encoded text.
$utf8Fixture = file_get_contents(__DIR__."/data/test1.txt");
$doubleEncoded = utf8_encode($utf8Fixture);

Test::identical(
    "fixUTF8() maintains UTF-8 string.",
    $utf8Fixture,
    Encoding::fixUTF8($utf8Fixture)
);
Test::not(
    "An UTF-8 double encoded string differs from a correct UTF-8 string.",
    $utf8Fixture,
    $doubleEncoded
);
Test::identical(
    "fixUTF8() reverts to UTF-8 a double encoded string.",
    $utf8Fixture,
    Encoding::fixUTF8($doubleEncoded)
);
/**
 * Checks that an array of utf8_encode()d fixtures (Latin-1, UTF-8, Latin-1)
 * is not loosely equal to an array holding the UTF-8 fixture three times.
 *
 * @return bool True when the two arrays differ.
 */
function test_double_encoded_arrays_are_different(){
    $utf8 = file_get_contents(__DIR__."/data/test1.txt");
    $latin1 = file_get_contents(__DIR__."/data/test1Latin.txt");

    $encoded = array(
        utf8_encode($latin1),
        utf8_encode($utf8),
        utf8_encode($latin1)
    );
    $allUtf8 = array($utf8, $utf8, $utf8);

    return !($encoded == $allUtf8);
}
/**
 * Checks that Encoding::fixUTF8() applied to an array of utf8_encode()d
 * fixtures (Latin-1, UTF-8, Latin-1) yields an array loosely equal to three
 * UTF-8 fixtures.
 *
 * @return bool True when the fixed array matches.
 */
function test_double_encoded_arrays_fix(){
    $utf8 = file_get_contents(__DIR__."/data/test1.txt");
    $latin1 = file_get_contents(__DIR__."/data/test1Latin.txt");

    $encoded = array(
        utf8_encode($latin1),
        utf8_encode($utf8),
        utf8_encode($latin1)
    );
    $expected = array($utf8, $utf8, $utf8);

    return Encoding::fixUTF8($encoded) == $expected;
}
// Array handling of fixUTF8().
Test::true("Source arrays are different (fixUTF8).", test_double_encoded_arrays_are_different());
Test::true("Fixing of double encoded array works.", test_double_encoded_arrays_fix());

// README examples: test name => input handed to fixUTF8().
$fixExamples = array(
    "fixUTF8() Example 1 still working." => "Fédération Camerounaise de Football\n",
    "fixUTF8() Example 2 still working." => "Fédération Camerounaise de Football\n",
    "fixUTF8() Example 3 still working." => "Fédération Camerounaise de Football\n",
    "fixUTF8() Example 4 still working." => "Fédération Camerounaise de Football\n"
);
foreach ($fixExamples as $exampleName => $exampleInput) {
    Test::identical(
        $exampleName,
        Encoding::fixUTF8($exampleInput),
        "Fédération Camerounaise de Football\n"
    );
}

// Print the pass/fail summary.
Test::totals();

View File

@@ -1,62 +0,0 @@
<?php
/**
 * Minimal static test harness: each assertion echoes "." on success or a
 * "<name> -> FAILED" line on failure and updates the running counters that
 * totals() prints.
 */
class Test
{
    /** Number of assertions that passed so far. */
    protected static $passed = 0;

    /** Number of assertions that failed so far. */
    protected static $failed = 0;

    /** 'char' or 'line' depending on what was echoed last; unset until then. */
    protected static $last_echoed;

    /**
     * Passes when $result is loosely equal to TRUE.
     */
    public static function true($test_name, $result)
    {
        return static::is($test_name, $result, TRUE);
    }

    /**
     * Passes when $result == $expected (loose comparison).
     */
    public static function is($test_name, $result, $expected)
    {
        if (!($result == $expected)) {
            static::failed($test_name);
            return;
        }
        static::passed($test_name);
    }

    /**
     * Passes when $result != $expected (loose comparison).
     */
    public static function not($test_name, $result, $expected)
    {
        if (!($result == $expected)) {
            static::passed($test_name);
            return;
        }
        static::failed($test_name);
    }

    /**
     * Passes when $result === $expected (strict comparison).
     */
    public static function identical($test_name, $result, $expected)
    {
        if (!($result === $expected)) {
            static::failed($test_name);
            return;
        }
        static::passed($test_name);
    }

    /**
     * Echoes the pass/fail counters, preceded by a blank line.
     */
    public static function totals()
    {
        echo "\n", static::$passed, " tests passed.\n", static::$failed, " tests failed.\n";
    }

    /** Reports a failed assertion on its own line and counts it. */
    private static function failed($test_name)
    {
        echo "\n", $test_name, " -> FAILED\n";
        static::$failed += 1;
    }

    /** Reports a passed assertion as a single dot and counts it. */
    private static function passed($test_name)
    {
        static::character(".");
        static::$passed += 1;
    }

    /** Echoes one progress character and records that a character came last. */
    private static function character($char)
    {
        echo $char;
        static::$last_echoed = 'char';
    }

    /** Echoes a full line, first ending any run of progress characters. */
    private static function line($msg)
    {
        if (static::$last_echoed == 'char') {
            echo "\n";
        }
        echo $msg, "\n";
        static::$last_echoed = 'line';
    }
}

View File

@@ -1 +0,0 @@
hello žš, привет

View File

@@ -1 +0,0 @@
Hírek

View File

@@ -1 +0,0 @@
H<EFBFBD>rek