СообЧа > База Знаний > Программирование > Delphi

Вопрос

Существуют ли в сободном для изучения доступе алгоритмы автоматического определения кодировки текста?

Ответ

О, еще сколько. Методом таблицы модельных распределений:

type
   TCodePage = (cpWin1251, cp866, cpKOI8R);
   PMap = ^TMap;
   TMap = array [#$80..#$FF] of Char;

function GetMap(CP: TCodePage): PMap;
{ должна возвращать указатель на таблицу
перекодировки из CP в Windows1251
   (nil для CP = cpWin1251) }
begin
   GetMap:=nil;
end;

function DetermineRussian(Buf: PChar; Count: Integer): TCodePage;
const
   ModelBigrams: array [0..33, 0..33] of Byte = ({АБВГДЕЖЗИЙКЛМHОПРСТУФХЦЧШЩЪЫЬЭЮЯ_Е}
{А}(0,20,44,12,22,23,16,60,4,9,63,93,47,110,
0,16,35,61,81,1,5,13,24,17,12,4,0,0,0,0,14,31,205,1),
{Б}(19,0,0,0,4,19,0,0,8,0,2,15,1,4,41,0,15,5,
0,15,0,2,1,0,0,6,16,37,0,0,0,4,3,0),
{В}(97,0,1,0,2,57,0,5,40,0,4,25,2,23,78,2,8,28,
4,12,0,1,0,0,8,1,0,40,1,0,0,5,106,3),
{Г}(13,0,0,0,9,5,0,0,15,0,1,17,1,2,96,0,24,0,0,
7,0,0,0,0,0,0,0,0,0,0,0,0,8,0),
{Д}(63,0,9,1,2,71,1,0,35,0,3,16,2,22,50,2,19,9,
2,25,0,2,1,0,1,0,1,9,4,0,1,5,17,4),
{Е}(4,14,15,34,56,22,13,14,2,34,39,77,73,150,6,
9,101,64,81,1,0,15,5,12,10,6,0,0,0,0,3,4,235,1),
{Ж}(13,0,0,0,12,47,0,0,16,0,1,0,0,23,0,0,0,0,0,3,
0,0,0,0,0,0,0,0,0,0,0,0,2,2),
{З}(76,2,11,3,11,4,1,0,7,0,2,4,11,24,17,0,6,1,0,8,
0,0,0,0,0,0,0,16,6,0,1,4,17,0),
{И}(7,9,32,5,18,60,4,42,31,27,28,46,55,49,12,7,26,
60,53,0,5,25,14,28,4,1,0,0,0,0,9,56,255,0),
{Й}(0,0,0,0,2,0,0,0,0,0,1,3,0,3,0,0,0,10,3,0,0,0,0,
1,1,0,0,0,0,0,0,0,122,0),
{К}(92,0,3,0,0,7,2,1,39,0,0,27,0,14,110,0,18,5,35,
18,0,0,11,0,0,0,0,0,0,0,0,0,5,5,0),
{Л}(85,1,0,2,1,70,6,0,85,0,5,3,0,9,67,1,0,9,0,15,0,
0,0,2,0,0,0,9,66,0,15,43,57,4),
{М}(44,0,0,0,0,65,0,0,47,0,1,1,10,15,57,7,0,2,0,24,
0,0,0,0,0,0,0,28,0,0,0,8,109,3),
{}(139,0,0,1,11,108,0,4,152,0,7,0,1,69,161,0,0,8,25,
24,5,1,5,2,0,1,0,83,10,0,1,29,38,5),
{О}(0,72,139,76,74,32,32,19,12,52,21,93,68,72,7,34,
93,102,98,1,2,6,6,19,15,2,0,0,0,1,4,9,252,2),
{П}(17,0,0,0,0,43,0,0,14,0,1,9,0,1,125,3,120,1,2,8,
0,0,0,0,0,0,0,3,6,0,0,3,2,2),
{Р}(151,1,6,4,3,103,7,0,76,0,4,0,11,10,117,1,0,5,9,
39,2,5,0,1,3,0,0,24,7,0,1,10,22,5),
{С}(24,1,21,0,3,39,0,0,33,0,56,41,11,15,58,30,5,30,
183,16,0,4,1,4,1,0,0,8,25,0,1,50,41,2),
{Т}(83,0,43,0,3,87,0,0,71,0,9,3,2,26,180,0,55,33,1,
23,1,0,1,4,0,0,0,20,78,0,0,5,82,4),
{У}(3,6,7,14,19,8,13,6,0,1,13,15,10,7,0,12,17,16,19,
0,1,3,0,12,5,8,0,0,0,0,22,1,65,0),
{Ф}(4,0,0,0,0,4,0,0,11,0,0,1,0,0,9,0,3,0,0,4,1,0,0,0,
0,0,0,0,0,0,0,0,2,0),
{Х}(9,0,2,0,0,2,0,0,5,0,0,1,0,5,26,0,4,1,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,76,0),
{Ц}(5,0,0,0,0,16,0,0,48,0,1,0,0,0,4,0,0,0,0,3,0,0,0,
0,0,0,0,2,0,0,0,0,3,0),
{Ч}(30,0,0,0,0,52,0,0,23,0,3,1,0,14,1,0,0,0,36,5,0,0,
0,0,1,0,0,0,1,0,0,0,2,2),
{Ш}(13,0,0,0,0,28,0,0,17,0,4,4,0,4,3,0,0,0,1,3,0,0,0,
0,0,0,0,0,3,0,0,0,1,1),
{Щ}(6,0,0,0,0,23,0,0,16,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,1,0,0,0,0,1),
{Ъ}(0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,1,0,0),
{Ы}(0,5,14,1,3,28,0,2,0,22,6,19,21,2,0,5,4,7,10,0,0,37,
0,3,4,0,0,0,0,0,0,1,84,0),
{Ь}(0,1,0,0,0,9,0,10,1,0,13,0,2,26,0,0,0,10,3,0,0,0,1,
0,6,0,0,0,0,0,6,4,117,0),
{Э}(0,0,0,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,31,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0),
{Ю}(0,5,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,1,15,0,0,0,1,4,1,
15,0,0,0,0,0,0,38,0),
{Я}(0,0,9,2,7,10,3,19,0,0,1,6,7,8,0,0,2,6,19,0,0,3,5,1,
0,3,0,0,0,0,5,2,177,0),
{_}(42,80,193,43,109,41,18,53,159,0,144,27,83,176,187,
229,70,231,99,47,15,13,6,58,7,0,0,0,0,38,0,22,0,2),
{Е}(0,0,0,0,3,0,0,0,0,0,2,4,4,8,0,0,5,3,4,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0));
{ "рейтинг" буквы Е условно
принимается равным 1/20 от "рейтинга" буквы E,
   если сочетание с участием Е корректно, иначе — 0
}

type
   TVariation = array [0..33, 0..33] of Integer;
var
    I, J, iC, iPredC, Max: Integer;
    C: Char;
    CP: TCodePage;
    D, MinD, Factor: Double;
   AMap: PMap;
    PV: ^TVariation;
   Vars: array [TCodePage] of TVariation;
begin
  DetermineRussian:=cpWin1251; { по yмолчанию }
    { вычисление распределений биграмм }
    FillChar(Vars, SizeOf(Vars), 0);
  for CP:=Low(Vars) to High(Vars) do begin
      AMap:=GetMap(CP);
      PV:=@Vars[CP];
      iPredC:=32;
      for I:=0 to Count — 1 do begin
        C:=Buf[I];
        iC:=32;
   if C >= #128 then begin
   if AMap <> nil then C:=AMap^[C];
     if not (C in ['Е', 'е']) then begin
  C:=Chr(Ord© and not 32); { 'a'..'я' -> 'А'..'Я' }
 if C in ['А'..'Я'] then iC:=Ord© — Ord('А');
   end
      else
      iC:=33;
   end;
    Inc(PV^[iPredC, iC]);
    iPredC:=iC;
    end;
    end;
{ вычисление метрики и определение наиболее
правдоподобной кодировки }
    MinD:=0;
 for CP:=Low(Vars) to High(Vars) do begin
   PV:=@Vars[CP];
  PV^[32, 32]:=0;
    Max:=1;
   for I:=0 to 33 do
   for J:=0 to 33 do
 if PV^[I, J] > Max then Max:=PV^[I, J];
  Factor:=255 / Max; { ноpмализация }
   D:=0;
  for I:=0 to 33 do
     for J:=0 to 33 do
    D:=D + Abs(PV^[I, J] * Factor — ModelBigrams[I, J]);
  if (MinD = 0) or (D < MinD) then begin
     MinD:=D;
   DetermineRussian:=CP;
   end;
    end;
end;

begin
{ тест: слово 'Пример' в разных кодировках
(веpоятность ошибок на таких
коpотких текстах высока — в данном слyчае
пpосто повезло!) }
writeln(DetermineRussian(#$CF#$F0#$E8#$EC#$E5#$F0, 6) = cpWin1251);
writeln(DetermineRussian(#$8F#$E0#$A8#$AC#$A5#$E0, 6) = cp866);
writeln(DetermineRussian(#$F0#$D2#$C9#$CD#$C5#$D2, 6) = cpKOI8R);
 readln;
end.


Stas Malinovski



Copyright © 2000-2004 Сообщество Чайников
Контактная информация