Skip to content

Commit 4387137

Browse files
committed
v0.2.0
1 parent 54bfa0d commit 4387137

30 files changed

+633
-1786
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
*.zip
22
*.7z
3-
.idea/*
3+
.idea/*
4+
tmp*

.vscode/launch.json

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
// Use IntelliSense to learn about possible attributes.
3+
// Hover to view descriptions of existing attributes.
4+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": "Launch",
9+
"type": "go",
10+
"request": "launch",
11+
"mode": "auto",
12+
"program": "${fileDirname}",
13+
"env": {},
14+
"args": []
15+
}
16+
]
17+
}

char_frac.xlsx

12 KB
Binary file not shown.

codePageTable.go

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package cpd
2+
3+
//codePageTable
4+
5+
//return index of rune in code page table
6+
//return 0 if rune not in code page table
7+
func (t *codePageTable) containsRune(r rune) int {
8+
for j, e := range *t {
9+
if r == e.code {
10+
return j
11+
}
12+
}
13+
return 0
14+
}
15+
16+
func (t *codePageTable) isUpper(r rune) bool {
17+
for i := 10; i < len(t); i++ {
18+
if r == (*t)[i].code {
19+
return true
20+
}
21+
}
22+
return false
23+
}

code_pages.go

+134
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Identifier scheme adapted from "golang.org/x/text/encoding/internal/identifier" (c) The Go Authors.
2+
// Contains the code page identifiers.
3+
// IDCodePage implements the String() method (fmt.Stringer).
4+
5+
package cpd
6+
7+
import (
8+
"fmt"
9+
"strings"
10+
)
11+
12+
//IDCodePage - index of code page
13+
type IDCodePage uint16
14+
15+
func (i IDCodePage) String() string {
16+
return codePageName[i]
17+
}
18+
19+
// itRuneMatch is the signature of a per-rune matcher: it receives a rune and
// a code page table and returns an int. Intended contract (per the original
// note): 1 if the rune belongs to this code page, 0 otherwise.
20+
type itRuneMatch func(r rune, tbl *codePageTable) int
21+
22+
// runesMatch is the signature of a whole-buffer matcher: it receives the raw
// data and a code page table and returns the number of elements of data that
// were recognised as belonging to the code page.
23+
type runesMatch func(data []byte, tbl *codePageTable) int
24+
25+
// tableElement pairs one rune of interest with a counter of its occurrences.
type tableElement struct {
26+
code rune // the rune we are interested in; it is present in this code page as a letter of the alphabet
27+
count int // number of occurrences of this rune
28+
}
29+
30+
// codePageTable holds the main (most frequently occurring) alphabet characters of a given encoding.
31+
// NOTE(review): the original note said "first 8 lower-case, second 8 upper-case", but the array has 19 slots; the tables in CodePages use slot 0 as an "absent" marker, slots 1-9 for lower-case and slots 10-18 for upper-case letters — confirm and keep in sync.
32+
type codePageTable [19]tableElement
33+
34+
// MatchRes is the final criterion of how well a data array matches a code page.
35+
type MatchRes struct {
36+
countMatch int
37+
}
38+
39+
// CodePage holds the data for one specific code page.
40+
type CodePage struct {
41+
id IDCodePage // identifier of the code page
42+
name string // name of the code page
43+
MatchRes // embedded match result (match count)
44+
match runesMatch // computes, from the input data, the number of matches for this code page
45+
table codePageTable // table of the main alphabet runes of this code page, used for frequency counting
46+
}
47+
48+
func (o CodePage) String() string {
49+
return fmt.Sprintf("id: %s, countMatch: %d", o.id, o.countMatch)
50+
}
51+
52+
//MatchingRunes - return string with rune/counts
53+
func (o CodePage) MatchingRunes() string {
54+
var sb strings.Builder
55+
fmt.Fprint(&sb, "rune/counts: ")
56+
for i, e := range o.table {
57+
if i != 0 {
58+
fmt.Fprintf(&sb, "%x/%d, ", e.code, e.count)
59+
}
60+
}
61+
return sb.String()
62+
}
63+
64+
// TCodePages is the slice type used to store all code pages.
65+
type TCodePages []CodePage
66+
67+
// DeepMach is currently a placeholder: it ignores data and always returns ASCII.
68+
func (o *TCodePages) DeepMach(data []byte) IDCodePage {
69+
return ASCII
70+
}
71+
72+
//Match - return IDCodePage
73+
//simple calculate count entry data runes in standart code page table
74+
func (o TCodePages) Match(data []byte) (result IDCodePage) {
75+
result = ASCII
76+
maxCount := 0
77+
for i, cp := range o {
78+
o[i].countMatch = cp.match(data, &o[i].table)
79+
if o[i].countMatch > maxCount {
80+
maxCount = o[i].countMatch
81+
result = cp.id
82+
}
83+
}
84+
return result
85+
}
86+
87+
//CodePages - slice of code pages
88+
var CodePages = TCodePages{
89+
{ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
90+
codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
91+
{IBM866, "IBM866", MatchRes{0}, runesMatch866,
92+
codePageTable{
93+
//first element serves as sign of absence
94+
{0, 0},
95+
//о е а и н т с р в
96+
{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
97+
{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
98+
{UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
99+
codePageTable{
100+
{0, 0},
101+
//о е а и н т с р в
102+
{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
103+
{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
104+
{Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
105+
codePageTable{
106+
{0, 0},
107+
//а и н с р в л к в
108+
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
109+
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
110+
{KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
111+
codePageTable{
112+
//о а и т с в л к в
113+
{0, 0},
114+
{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xD7, 0},
115+
{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xF7, 0}}},
116+
}
117+
118+
//codePageName - string of code page name
119+
var codePageName = map[IDCodePage]string{
120+
ASCII: "ASCII",
121+
IBM866: "IBM866",
122+
Windows1251: "Windows1251",
123+
UTF8: "UTF8",
124+
UTF16: "UTF16",
125+
UTF16LE: "UTF16LE",
126+
UTF16BE: "UTF16BE",
127+
UTF32: "UTF32",
128+
KOI8R: "KOI8R",
129+
ISO5427Cyrillic: "ISO5427Cyrillic",
130+
ISO51INISCyrillic: "ISO51INISCyrillic",
131+
ISO111ECMACyrillic: "ISO111ECMACyrillic",
132+
ISO153GOST1976874: "ISO153GOST1976874",
133+
Unicode: "Unicode",
134+
}

code_pages_id.go

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package cpd
2+
3+
const (
4+
// ASCII is the uint16 identifier with IANA name US-ASCII (MIME: US-ASCII).
5+
// ANSI X3.4-1986
6+
// Reference: RFC2046
7+
ASCII IDCodePage = 3
8+
9+
// ISO5427Cyrillic is the uint16 identifier with IANA name ISO_5427.
10+
// ISO-IR: International Register of Escape Sequences
11+
// Note: The current registration authority is IPSJ/ITSCJ, Japan.
12+
// Reference: RFC1345
13+
ISO5427Cyrillic IDCodePage = 48
14+
15+
// ISO51INISCyrillic is the uint16 identifier with IANA name INIS-cyrillic.
16+
// ISO-IR: International Register of Escape Sequences
17+
// Note: The current registration authority is IPSJ/ITSCJ, Japan.
18+
// Reference: RFC1345
19+
ISO51INISCyrillic IDCodePage = 53
20+
21+
// ISO111ECMACyrillic is the uint16 identifier with IANA name ECMA-cyrillic.
22+
// ISO registry
23+
// (formerly ECMA registry )
24+
ISO111ECMACyrillic IDCodePage = 77
25+
26+
// ISO153GOST1976874 is the uint16 identifier with IANA name GOST_19768-74.
27+
// ISO-IR: International Register of Escape Sequences
28+
// Note: The current registration authority is IPSJ/ITSCJ, Japan.
29+
// Reference: RFC1345
30+
ISO153GOST1976874 IDCodePage = 94
31+
32+
// UTF8 is the uint16 identifier with IANA name UTF-8.
33+
//
34+
// rfc3629
35+
// Reference: RFC3629
36+
UTF8 IDCodePage = 106
37+
38+
// Unicode is the uint16 identifier with IANA name ISO-10646-UCS-2.
39+
//
40+
// the 2-octet Basic Multilingual Plane, aka Unicode
41+
// this needs to specify network byte order: the standard
42+
// does not specify (it is a 16-bit integer space)
43+
Unicode IDCodePage = 1000
44+
45+
// UnicodeASCII is the uint16 identifier with IANA name ISO-10646-UCS-Basic.
46+
//
47+
// ASCII subset of Unicode. Basic Latin = collection 1
48+
// See ISO 10646, Appendix A
49+
UnicodeASCII IDCodePage = 1002
50+
51+
// UTF7 is the uint16 identifier with IANA name UTF-7.
52+
//
53+
// rfc2152
54+
// Reference: RFC2152
55+
UTF7 IDCodePage = 1012
56+
57+
// UTF16BE is the uint16 identifier with IANA name UTF-16BE.
58+
//
59+
// rfc2781
60+
// Reference: RFC2781
61+
UTF16BE IDCodePage = 1013
62+
63+
// UTF16LE is the uint16 identifier with IANA name UTF-16LE.
64+
//
65+
// rfc2781
66+
// Reference: RFC2781
67+
UTF16LE IDCodePage = 1014
68+
69+
// UTF16 is the uint16 identifier with IANA name UTF-16.
70+
//
71+
// rfc2781
72+
// Reference: RFC2781
73+
UTF16 IDCodePage = 1015
74+
75+
// UTF32 is the uint16 identifier with IANA name UTF-32.
76+
//
77+
// https://www.unicode.org/unicode/reports/tr19/
78+
UTF32 IDCodePage = 1017
79+
80+
// UTF32BE is the uint16 identifier with IANA name UTF-32BE.
81+
//
82+
// https://www.unicode.org/unicode/reports/tr19/
83+
UTF32BE IDCodePage = 1018
84+
85+
// UTF32LE is the uint16 identifier with IANA name UTF-32LE.
86+
//
87+
// https://www.unicode.org/unicode/reports/tr19/
88+
UTF32LE IDCodePage = 1019
89+
90+
// KOI8R is the uint16 identifier with IANA name KOI8-R (MIME: KOI8-R).
91+
//
92+
// rfc1489 , based on GOST-19768-74, ISO-6937/8,
93+
// INIS-Cyrillic, ISO-5427.
94+
// Reference: RFC1489
95+
KOI8R IDCodePage = 2084
96+
97+
// IBM866 is the uint16 identifier with IANA name IBM866.
98+
//
99+
// IBM NLDG Volume 2 (SE09-8002-03) August 1994
100+
IBM866 IDCodePage = 2086
101+
102+
// Windows1251 is the uint16 identifier with IANA name windows-1251.
103+
//
104+
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1251
105+
Windows1251 IDCodePage = 2251
106+
107+
// Windows1252 is the uint16 identifier with IANA name windows-1252.
108+
//
109+
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1252
110+
Windows1252 IDCodePage = 2252
111+
)

const.go

-15
This file was deleted.

cp_deep_maching.go

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package cpd
2+
3+
//checkHeader - check buffer for match to utf-8, utf-16le or utf-16be BOM
4+
func checkHeader(b []byte) (id IDCodePage, res bool) {
5+
if bomUTF8(b) {
6+
return UTF8, true
7+
}
8+
if bomUTF16le(b) {
9+
return UTF16LE, true
10+
}
11+
if bomUTF16be(b) {
12+
return UTF16BE, true
13+
}
14+
return ASCII, false
15+
}
16+
17+
// bomUTF8 reports whether b starts with the three bytes EF BB BF.
// Buffers shorter than three bytes never match.
func bomUTF8(b []byte) bool {
	return len(b) >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF
}
23+
24+
// bomUTF16le reports whether b starts with the two bytes FF FE.
// Buffers shorter than two bytes never match.
func bomUTF16le(b []byte) bool {
	return len(b) >= 2 && b[0] == 0xFF && b[1] == 0xFE
}
30+
31+
// bomUTF16be reports whether b starts with the two bytes FE FF.
// Buffers shorter than two bytes never match.
func bomUTF16be(b []byte) bool {
	return len(b) >= 2 && b[0] == 0xFE && b[1] == 0xFF
}
37+
38+
//ASCII block
// itASCII is the per-rune matcher for ASCII. It is a stub: both arguments
// are ignored and the result is always 0.
39+
func itASCII(r rune, tbl *codePageTable) int {
40+
return 0
41+
}
42+
43+
// runesMatchASCII is the whole-buffer matcher registered for the ASCII code
// page. It is a stub: both arguments are ignored and the result is always 0.
func runesMatchASCII(b []byte, tbl *codePageTable) int {
44+
return 0
45+
}

0 commit comments

Comments
 (0)