|
// File from "golang.org/x/text/encoding/internal/identifier" (c) The Go Authors.
// Contains the identifier of a code page.
// IDCodePage implements the String() method.
| 4 | + |
| 5 | +package cpd |
| 6 | + |
| 7 | +import ( |
| 8 | + "fmt" |
| 9 | + "strings" |
| 10 | +) |
| 11 | + |
| 12 | +//IDCodePage - index of code page |
| 13 | +type IDCodePage uint16 |
| 14 | + |
| 15 | +func (i IDCodePage) String() string { |
| 16 | + return codePageName[i] |
| 17 | +} |
| 18 | + |
| 19 | +//itRuneMatch - return 1 if rune from this code page, 0 else |
| 20 | +type itRuneMatch func(r rune, tbl *codePageTable) int |
| 21 | + |
| 22 | +//runesMatch - return count of entry elements of data to code page |
| 23 | +type runesMatch func(data []byte, tbl *codePageTable) int |
| 24 | + |
// tableElement pairs one rune with its occurrence counter.
type tableElement struct {
	code  rune // the rune of interest; it is present in this code page as a letter of the alphabet
	count int  // number of occurrences of this rune
}
| 29 | + |
| 30 | +//codePageTable - содержит основные (наиболее часто встречаемые) символы алфавита в данной кодировке |
| 31 | +//первые 8 прописные, 2-я восьмёрка заглавные |
| 32 | +type codePageTable [19]tableElement |
| 33 | + |
// MatchRes is the final criterion of how well a data array matches a code page.
type MatchRes struct {
	countMatch int // match count; TCodePages.Match stores the matcher's result here
}
| 38 | + |
| 39 | +//CodePage - содержит данные по конкретной кодовой странице |
| 40 | +type CodePage struct { |
| 41 | + id IDCodePage //id of code page |
| 42 | + name string //name of code page |
| 43 | + MatchRes //count of matching |
| 44 | + match runesMatch //calculate from input data count of entry to codepage |
| 45 | + table codePageTable //table of main alfabet rune of this code page, use for calculate frequency |
| 46 | +} |
| 47 | + |
| 48 | +func (o CodePage) String() string { |
| 49 | + return fmt.Sprintf("id: %s, countMatch: %d", o.id, o.countMatch) |
| 50 | +} |
| 51 | + |
| 52 | +//MatchingRunes - return string with rune/counts |
| 53 | +func (o CodePage) MatchingRunes() string { |
| 54 | + var sb strings.Builder |
| 55 | + fmt.Fprint(&sb, "rune/counts: ") |
| 56 | + for i, e := range o.table { |
| 57 | + if i != 0 { |
| 58 | + fmt.Fprintf(&sb, "%x/%d, ", e.code, e.count) |
| 59 | + } |
| 60 | + } |
| 61 | + return sb.String() |
| 62 | +} |
| 63 | + |
| 64 | +//TCodePages - type for store all code page |
| 65 | +type TCodePages []CodePage |
| 66 | + |
| 67 | +//DeepMach - |
| 68 | +func (o *TCodePages) DeepMach(data []byte) IDCodePage { |
| 69 | + return ASCII |
| 70 | +} |
| 71 | + |
| 72 | +//Match - return IDCodePage |
| 73 | +//simple calculate count entry data runes in standart code page table |
| 74 | +func (o TCodePages) Match(data []byte) (result IDCodePage) { |
| 75 | + result = ASCII |
| 76 | + maxCount := 0 |
| 77 | + for i, cp := range o { |
| 78 | + o[i].countMatch = cp.match(data, &o[i].table) |
| 79 | + if o[i].countMatch > maxCount { |
| 80 | + maxCount = o[i].countMatch |
| 81 | + result = cp.id |
| 82 | + } |
| 83 | + } |
| 84 | + return result |
| 85 | +} |
| 86 | + |
| 87 | +//CodePages - slice of code pages |
| 88 | +var CodePages = TCodePages{ |
| 89 | + {ASCII, "ASCII", MatchRes{0}, runesMatchASCII, |
| 90 | + codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}}, |
| 91 | + {IBM866, "IBM866", MatchRes{0}, runesMatch866, |
| 92 | + codePageTable{ |
| 93 | + //first element serves as sign of absence |
| 94 | + {0, 0}, |
| 95 | + //о е а и н т с р в |
| 96 | + {0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0}, |
| 97 | + {0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}}, |
| 98 | + {UTF8, "UTF8", MatchRes{0}, runesMatchUTF8, |
| 99 | + codePageTable{ |
| 100 | + {0, 0}, |
| 101 | + //о е а и н т с р в |
| 102 | + {0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0}, |
| 103 | + {0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}}, |
| 104 | + {Windows1251, "Windows1251", MatchRes{0}, runesMatch1251, |
| 105 | + codePageTable{ |
| 106 | + {0, 0}, |
| 107 | + //а и н с р в л к в |
| 108 | + {0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0}, |
| 109 | + {0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}}, |
| 110 | + {KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8, |
| 111 | + codePageTable{ |
| 112 | + //о а и т с в л к в |
| 113 | + {0, 0}, |
| 114 | + {0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xD7, 0}, |
| 115 | + {0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xF7, 0}}}, |
| 116 | +} |
| 117 | + |
| 118 | +//codePageName - string of code page name |
| 119 | +var codePageName = map[IDCodePage]string{ |
| 120 | + ASCII: "ASCII", |
| 121 | + IBM866: "IBM866", |
| 122 | + Windows1251: "Windows1251", |
| 123 | + UTF8: "UTF8", |
| 124 | + UTF16: "UTF16", |
| 125 | + UTF16LE: "UTF16LE", |
| 126 | + UTF16BE: "UTF16BE", |
| 127 | + UTF32: "UTF32", |
| 128 | + KOI8R: "KOI8R", |
| 129 | + ISO5427Cyrillic: "ISO5427Cyrillic", |
| 130 | + ISO51INISCyrillic: "ISO51INISCyrillic", |
| 131 | + ISO111ECMACyrillic: "ISO111ECMACyrillic", |
| 132 | + ISO153GOST1976874: "ISO153GOST1976874", |
| 133 | + Unicode: "Unicode", |
| 134 | +} |
0 commit comments