mirror of
https://github.com/Jinnrry/PMail.git
synced 2025-10-30 11:26:26 +08:00
427 lines
10 KiB
Go
427 lines
10 KiB
Go
package parsemail
|
||
|
||
// copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.4:src/mime/encodedword.go
|
||
// The decoder in Go's standard library does not support Chinese charsets; this copy adds support for the GBK and GB18030 encodings.
|
||
|
||
import (
|
||
"bytes"
|
||
"encoding/base64"
|
||
"errors"
|
||
"fmt"
|
||
"golang.org/x/text/encoding/simplifiedchinese"
|
||
"io"
|
||
"strings"
|
||
"unicode"
|
||
"unicode/utf8"
|
||
)
|
||
|
||
// WordEncoder selects the RFC 2047 encoded-word scheme used when encoding.
// The underlying byte is the scheme letter that is written into the
// encoded-word header.
type WordEncoder byte

// Encoding schemes available to a WordEncoder.
const (
	// BEncoding is the Base64 scheme, as defined by RFC 2045.
	BEncoding WordEncoder = 'b'
	// QEncoding is the Q-encoding scheme, as defined by RFC 2047.
	QEncoding WordEncoder = 'q'
)
|
||
|
||
// errInvalidWord is returned when the input is not a well-formed RFC 2047
// encoded-word.
var errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
|
||
|
||
// Encode returns the encoded-word form of s. If s is ASCII without special
|
||
// characters, it is returned unchanged. The provided charset is the IANA
|
||
// charset name of s. It is case insensitive.
|
||
func (e WordEncoder) Encode(charset, s string) string {
|
||
if !needsEncoding(s) {
|
||
return s
|
||
}
|
||
return e.encodeWord(charset, s)
|
||
}
|
||
|
||
// needsEncoding reports whether s contains any rune outside the printable
// ASCII range (' ' through '~') other than a horizontal tab.
func needsEncoding(s string) bool {
	for _, r := range s {
		if r == '\t' {
			continue
		}
		if r < ' ' || r > '~' {
			return true
		}
	}
	return false
}
|
||
|
||
// encodeWord encodes a string into an encoded-word.
|
||
func (e WordEncoder) encodeWord(charset, s string) string {
|
||
var buf strings.Builder
|
||
// Could use a hint like len(s)*3, but that's not enough for cases
|
||
// with word splits and too much for simpler inputs.
|
||
// 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
|
||
buf.Grow(48)
|
||
|
||
e.openWord(&buf, charset)
|
||
if e == BEncoding {
|
||
e.bEncode(&buf, charset, s)
|
||
} else {
|
||
e.qEncode(&buf, charset, s)
|
||
}
|
||
closeWord(&buf)
|
||
|
||
return buf.String()
|
||
}
|
||
|
||
// maxEncodedWordLen is the maximum length of an encoded-word: 75 characters.
// See RFC 2047, section 2.
const maxEncodedWordLen = 75

// maxContentLen is how much encoded content fits in one encoded-word once
// the "=?UTF-8?q?" header and the 2-byte "?=" footer are subtracted.
const maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")

// maxBase64Len is the number of raw bytes whose base64 form fits within
// maxContentLen characters.
var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
|
||
|
||
// bEncode encodes s using base64 encoding and writes it to buf.
|
||
func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
|
||
w := base64.NewEncoder(base64.StdEncoding, buf)
|
||
// If the charset is not UTF-8 or if the content is short, do not bother
|
||
// splitting the encoded-word.
|
||
if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
|
||
io.WriteString(w, s)
|
||
w.Close()
|
||
return
|
||
}
|
||
|
||
var currentLen, last, runeLen int
|
||
for i := 0; i < len(s); i += runeLen {
|
||
// Multi-byte characters must not be split across encoded-words.
|
||
// See RFC 2047, section 5.3.
|
||
_, runeLen = utf8.DecodeRuneInString(s[i:])
|
||
|
||
if currentLen+runeLen <= maxBase64Len {
|
||
currentLen += runeLen
|
||
} else {
|
||
io.WriteString(w, s[last:i])
|
||
w.Close()
|
||
e.splitWord(buf, charset)
|
||
last = i
|
||
currentLen = runeLen
|
||
}
|
||
}
|
||
io.WriteString(w, s[last:])
|
||
w.Close()
|
||
}
|
||
|
||
// qEncode encodes s using Q encoding and writes it to buf. It splits the
|
||
// encoded-words when necessary.
|
||
func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
|
||
// We only split encoded-words when the charset is UTF-8.
|
||
if !isUTF8(charset) {
|
||
writeQString(buf, s)
|
||
return
|
||
}
|
||
|
||
var currentLen, runeLen int
|
||
for i := 0; i < len(s); i += runeLen {
|
||
b := s[i]
|
||
// Multi-byte characters must not be split across encoded-words.
|
||
// See RFC 2047, section 5.3.
|
||
var encLen int
|
||
if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
|
||
runeLen, encLen = 1, 1
|
||
} else {
|
||
_, runeLen = utf8.DecodeRuneInString(s[i:])
|
||
encLen = 3 * runeLen
|
||
}
|
||
|
||
if currentLen+encLen > maxContentLen {
|
||
e.splitWord(buf, charset)
|
||
currentLen = 0
|
||
}
|
||
writeQString(buf, s[i:i+runeLen])
|
||
currentLen += encLen
|
||
}
|
||
}
|
||
|
||
// writeQString encodes s using Q encoding and writes it to buf.
|
||
func writeQString(buf *strings.Builder, s string) {
|
||
for i := 0; i < len(s); i++ {
|
||
switch b := s[i]; {
|
||
case b == ' ':
|
||
buf.WriteByte('_')
|
||
case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
|
||
buf.WriteByte(b)
|
||
default:
|
||
buf.WriteByte('=')
|
||
buf.WriteByte(upperhex[b>>4])
|
||
buf.WriteByte(upperhex[b&0x0f])
|
||
}
|
||
}
|
||
}
|
||
|
||
// openWord writes the beginning of an encoded-word into buf.
|
||
func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
|
||
buf.WriteString("=?")
|
||
buf.WriteString(charset)
|
||
buf.WriteByte('?')
|
||
buf.WriteByte(byte(e))
|
||
buf.WriteByte('?')
|
||
}
|
||
|
||
// closeWord appends the "?=" terminator of an encoded-word to buf.
func closeWord(buf *strings.Builder) {
	const terminator = "?="
	buf.WriteString(terminator)
}
|
||
|
||
// splitWord closes the current encoded-word and opens a new one.
|
||
func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
|
||
closeWord(buf)
|
||
buf.WriteByte(' ')
|
||
e.openWord(buf, charset)
|
||
}
|
||
|
||
// isUTF8 reports whether charset names UTF-8, ignoring letter case.
func isUTF8(charset string) bool {
	const name = "UTF-8"
	return strings.EqualFold(name, charset)
}
|
||
|
||
// upperhex maps a 4-bit value (used as an index) to its upper-case
// hexadecimal digit.
const upperhex = "0123456789" + "ABCDEF"
|
||
|
||
// WordDecoder decodes RFC 2047 encoded-words found in MIME headers.
type WordDecoder struct {
	// CharsetReader is an optional hook that returns a reader converting
	// input from the named charset into UTF-8. It is consulted only for
	// charsets that are not handled internally; utf-8, iso-8859-1,
	// us-ascii, gb18030 and gbk never reach it. The charset name passed in
	// is always lower-case.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
|
||
|
||
// Decode decodes an RFC 2047 encoded-word.
|
||
func (d *WordDecoder) Decode(word string) (string, error) {
|
||
// See https://tools.ietf.org/html/rfc2047#section-2 for details.
|
||
// Our decoder is permissive, we accept empty encoded-text.
|
||
if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
|
||
return "", errInvalidWord
|
||
}
|
||
word = word[2 : len(word)-2]
|
||
|
||
// split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
|
||
charset, text, _ := strings.Cut(word, "?")
|
||
if charset == "" {
|
||
return "", errInvalidWord
|
||
}
|
||
encoding, text, _ := strings.Cut(text, "?")
|
||
if len(encoding) != 1 {
|
||
return "", errInvalidWord
|
||
}
|
||
|
||
content, err := decode(encoding[0], text)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
var buf strings.Builder
|
||
if err := d.convert(&buf, charset, content); err != nil {
|
||
return "", err
|
||
}
|
||
return buf.String(), nil
|
||
}
|
||
|
||
// DecodeHeader decodes all encoded-words of the given string. It returns an
|
||
// error if and only if CharsetReader of d returns an error.
|
||
func (d *WordDecoder) DecodeHeader(header string) (string, error) {
|
||
// If there is no encoded-word, returns before creating a buffer.
|
||
i := strings.Index(header, "=?")
|
||
if i == -1 {
|
||
return header, nil
|
||
}
|
||
|
||
var buf strings.Builder
|
||
|
||
buf.WriteString(header[:i])
|
||
header = header[i:]
|
||
|
||
betweenWords := false
|
||
for {
|
||
start := strings.Index(header, "=?")
|
||
if start == -1 {
|
||
break
|
||
}
|
||
cur := start + len("=?")
|
||
|
||
i := strings.Index(header[cur:], "?")
|
||
if i == -1 {
|
||
break
|
||
}
|
||
charset := header[cur : cur+i]
|
||
cur += i + len("?")
|
||
|
||
if len(header) < cur+len("Q??=") {
|
||
break
|
||
}
|
||
encoding := header[cur]
|
||
cur++
|
||
|
||
if header[cur] != '?' {
|
||
break
|
||
}
|
||
cur++
|
||
|
||
j := strings.Index(header[cur:], "?=")
|
||
if j == -1 {
|
||
break
|
||
}
|
||
text := header[cur : cur+j]
|
||
end := cur + j + len("?=")
|
||
|
||
content, err := decode(encoding, text)
|
||
if err != nil {
|
||
betweenWords = false
|
||
buf.WriteString(header[:start+2])
|
||
header = header[start+2:]
|
||
continue
|
||
}
|
||
|
||
// Write characters before the encoded-word. White-space and newline
|
||
// characters separating two encoded-words must be deleted.
|
||
if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
|
||
buf.WriteString(header[:start])
|
||
}
|
||
|
||
if err := d.convert(&buf, charset, content); err != nil {
|
||
return "", err
|
||
}
|
||
|
||
header = header[end:]
|
||
betweenWords = true
|
||
}
|
||
|
||
if len(header) > 0 {
|
||
buf.WriteString(header)
|
||
}
|
||
|
||
return buf.String(), nil
|
||
}
|
||
|
||
func decode(encoding byte, text string) ([]byte, error) {
|
||
switch encoding {
|
||
case 'B', 'b':
|
||
return base64.StdEncoding.DecodeString(text)
|
||
case 'Q', 'q':
|
||
return qDecode(text)
|
||
default:
|
||
return nil, errInvalidWord
|
||
}
|
||
}
|
||
|
||
func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
|
||
switch {
|
||
case strings.EqualFold("utf-8", charset):
|
||
buf.Write(content)
|
||
case strings.EqualFold("iso-8859-1", charset):
|
||
for _, c := range content {
|
||
buf.WriteRune(rune(c))
|
||
}
|
||
case strings.EqualFold("us-ascii", charset):
|
||
for _, c := range content {
|
||
if c >= utf8.RuneSelf {
|
||
buf.WriteRune(unicode.ReplacementChar)
|
||
} else {
|
||
buf.WriteByte(c)
|
||
}
|
||
}
|
||
case strings.EqualFold("gb18030", charset):
|
||
decodeBytes, err := simplifiedchinese.GB18030.NewDecoder().Bytes(content)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
buf.Write(decodeBytes)
|
||
case strings.EqualFold("gbk", charset):
|
||
decodeBytes, err := simplifiedchinese.GBK.NewDecoder().Bytes(content)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
buf.Write(decodeBytes)
|
||
default:
|
||
if d.CharsetReader == nil {
|
||
return fmt.Errorf("mime: unhandled charset %q", charset)
|
||
}
|
||
r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if _, err = io.Copy(buf, r); err != nil {
|
||
return err
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// hasNonWhitespace reports whether s (assumed to be ASCII) contains anything
// other than space, tab, line feed or carriage return.
func hasNonWhitespace(s string) bool {
	for _, r := range s {
		// Encoded-words can only be separated by linear white space, which
		// does not include vertical tabs (\v).
		if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
			return true
		}
	}
	return false
}
|
||
|
||
// qDecode decodes a Q encoded string.
|
||
func qDecode(s string) ([]byte, error) {
|
||
dec := make([]byte, len(s))
|
||
n := 0
|
||
for i := 0; i < len(s); i++ {
|
||
switch c := s[i]; {
|
||
case c == '_':
|
||
dec[n] = ' '
|
||
case c == '=':
|
||
if i+2 >= len(s) {
|
||
return nil, errInvalidWord
|
||
}
|
||
b, err := readHexByte(s[i+1], s[i+2])
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
dec[n] = b
|
||
i += 2
|
||
case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
|
||
dec[n] = c
|
||
default:
|
||
return nil, errInvalidWord
|
||
}
|
||
n++
|
||
}
|
||
|
||
return dec[:n], nil
|
||
}
|
||
|
||
// readHexByte returns the byte from its quoted-printable representation.
|
||
func readHexByte(a, b byte) (byte, error) {
|
||
var hb, lb byte
|
||
var err error
|
||
if hb, err = fromHex(a); err != nil {
|
||
return 0, err
|
||
}
|
||
if lb, err = fromHex(b); err != nil {
|
||
return 0, err
|
||
}
|
||
return hb<<4 | lb, nil
|
||
}
|
||
|
||
// fromHex returns the numeric value (0-15) of the hexadecimal digit b, or an
// error if b is not a hexadecimal digit.
func fromHex(b byte) (byte, error) {
	if '0' <= b && b <= '9' {
		return b - '0', nil
	}
	if 'A' <= b && b <= 'F' {
		return b - 'A' + 10, nil
	}
	// Lower-case digits are badly encoded but accepted anyway.
	if 'a' <= b && b <= 'f' {
		return b - 'a' + 10, nil
	}
	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
}
|