Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
226
venv/Lib/site-packages/pygments/lexers/mime.py
Normal file
226
venv/Lib/site-packages/pygments/lexers/mime.py
Normal file
|
@ -0,0 +1,226 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pygments.lexers.mime
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Lexer for Multipurpose Internet Mail Extensions (MIME) data.
|
||||
|
||||
:copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
|
||||
:license: BSD, see LICENSE for details.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from pygments.lexer import RegexLexer, include
|
||||
from pygments.lexers import get_lexer_for_mimetype
|
||||
from pygments.token import Text, Name, String, Operator, Comment, Other
|
||||
from pygments.util import get_int_opt, ClassNotFound
|
||||
|
||||
__all__ = ["MIMELexer"]
|
||||
|
||||
|
||||
class MIMELexer(RegexLexer):
    """
    Lexer for Multipurpose Internet Mail Extensions (MIME) data. This lexer is
    designed to process nested multipart data.

    It assumes that the given data contains both a header and a body
    (separated by an empty line). If no valid header is found, the entire
    data is treated as body.

    Additional options accepted:

    `MIME-max-level`
        Max recursion level for nested MIME structure. Any negative number
        is treated as unlimited. (default: -1)

    `Content-Type`
        Treat the data as a specific content type. Useful when the header is
        missing; otherwise this lexer tries to parse it from the header.
        (default: `text/plain`)

    `Multipart-Boundary`
        Set the default multipart boundary delimiter. This option is only used
        when `Content-Type` is `multipart` and the header is missing. This
        lexer tries to parse it from the header by default. (default: None)

    `Content-Transfer-Encoding`
        Treat the data as a specific encoding. Otherwise this lexer tries to
        parse it from the header by default. (default: None)

    For backward compatibility the underscore spellings `Content_Type` and
    `Content_Transfer_Encoding` are accepted as well; when both spellings of
    an option are given, the underscore one takes precedence.

    .. versionadded:: 2.5
    """

    name = "MIME"
    aliases = ["mime"]
    mimetypes = ["multipart/mixed",
                 "multipart/related",
                 "multipart/alternative"]

    # One complete header field: "Name: value" plus any folded continuation
    # lines (lines that start with a space or a tab).  Only used by
    # analyse_text to check that a header section contains nothing else.
    _header_field_re = re.compile(r"^[\w-]+:.*(?:\n[ \t].*)*$", re.MULTILINE)

    def __init__(self, **options):
        """Read the MIME specific lexer options (see the class docstring)."""
        super().__init__(**options)
        self.boundary = options.get("Multipart-Boundary")
        self.content_transfer_encoding = self._get_header_opt(
            options, "Content-Transfer-Encoding")
        self.content_type = self._get_header_opt(
            options, "Content-Type", "text/plain")
        self.max_nested_level = get_int_opt(options, "MIME-max-level", -1)

    @staticmethod
    def _get_header_opt(options, name, default=None):
        """Look up option *name*, also accepting its legacy underscore form.

        The legacy spelling (``-`` replaced by ``_``) is checked first so
        that callers relying on it keep getting exactly the same value.
        """
        legacy_name = name.replace("-", "_")
        if legacy_name in options:
            return options[legacy_name]
        return options.get(name, default)

    def analyse_text(text):
        """Score how likely *text* is MIME data.

        Returns 1 when the text splits at the first blank line into a header
        section made up solely of ``Name: value`` fields (with optional
        folded continuation lines) and a non-empty body; returns 0.1 in every
        other case.
        """
        try:
            header, body = text.strip().split("\n\n", 1)
        except ValueError:
            # no blank line, hence no header/body separation
            return 0.1

        if not body.strip():
            return 0.1

        # Whatever is left after removing every well-formed header field is
        # something that does not belong in a header section.
        invalid_headers = MIMELexer._header_field_re.sub("", header)
        if invalid_headers.strip():
            return 0.1
        return 1

    def get_header_tokens(self, match):
        """Callback for one header field matched by the ``root`` state.

        Headers listed in ``attention_headers`` are tokenized in detail by
        re-lexing their value in the state named after the lower-cased field
        name; every other header is emitted as a single ``Comment`` token.
        """
        field = match.group(1)

        if field.lower() not in self.attention_headers:
            yield match.start(), Comment, match.group()
            return

        yield match.start(1), Name.Tag, field + ":"
        yield match.start(2), Text.Whitespace, match.group(2)

        # offsets produced by the nested run are relative to the header value
        pos = match.end(2)
        body = match.group(3)
        for i, t, v in self.get_tokens_unprocessed(body, ("root", field.lower())):
            yield pos + i, t, v

    def get_body_tokens(self, match):
        """Callback for the body (everything from the first empty line on).

        Without a multipart content type or without a known boundary the
        whole body is handed to ``get_bodypart_tokens``.  Otherwise the body
        is cut at every boundary line; each part goes through
        ``get_bodypart_tokens`` and each boundary line is emitted as
        ``String.Delimiter``.  Text before the first and after the last
        boundary is emitted as plain ``Text``.
        """
        pos_body_start = match.start()
        entire_body = match.group()

        # skip first newline
        if entire_body[0] == '\n':
            yield pos_body_start, Text.Whitespace, '\n'
            pos_body_start = pos_body_start + 1
            entire_body = entire_body[1:]

        # if it is not a multipart
        if not self.content_type.startswith("multipart") or not self.boundary:
            for i, t, v in self.get_bodypart_tokens(entire_body):
                yield pos_body_start + i, t, v
            return

        # find boundary
        bdry_pattern = r"^--%s(--)?\n" % re.escape(self.boundary)
        bdry_matcher = re.compile(bdry_pattern, re.MULTILINE)

        # some data has prefix text before first boundary
        m = bdry_matcher.search(entire_body)
        if m:
            pos_part_start = pos_body_start + m.end()
            pos_iter_start = m.end()
            yield pos_body_start, Text, entire_body[:m.start()]
            # the delimiter token begins where the match begins, not where
            # it ends
            yield pos_body_start + m.start(), String.Delimiter, m.group()
        else:
            pos_part_start = pos_body_start
            pos_iter_start = 0

        # process tokens of each body part
        for m in bdry_matcher.finditer(entire_body, pos_iter_start):
            # bodypart
            lpos_start = pos_part_start - pos_body_start
            lpos_end = m.start()
            part = entire_body[lpos_start:lpos_end]
            for i, t, v in self.get_bodypart_tokens(part):
                yield pos_part_start + i, t, v

            # boundary
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
            pos_part_start = pos_body_start + m.end()

        # some data has suffix text after last boundary
        lpos_start = pos_part_start - pos_body_start
        if lpos_start != len(entire_body):
            yield pos_part_start, Text, entire_body[lpos_start:]

    def get_bodypart_tokens(self, text):
        """Tokenize one body part with the lexer registered for its type.

        Returns an iterable of ``(index, tokentype, value)`` tuples with
        indexes relative to *text*.  The part is returned as one ``Other``
        token when it cannot or should not be lexed any further.
        """
        # return if:
        #  * no content
        #  * no content type specific
        #  * content encoding is not readable
        #  * max recursion exceed
        if not text.strip() or not self.content_type:
            return [(0, Other, text)]

        cte = self.content_transfer_encoding
        if cte and cte not in {"8bit", "7bit", "quoted-printable"}:
            return [(0, Other, text)]

        if self.max_nested_level == 0:
            return [(0, Other, text)]

        # get lexer
        try:
            lexer = get_lexer_for_mimetype(self.content_type)
        except ClassNotFound:
            return [(0, Other, text)]

        # a nested MIME lexer gets one level less of the recursion budget
        if isinstance(lexer, type(self)):
            lexer.max_nested_level = self.max_nested_level - 1

        return lexer.get_tokens_unprocessed(text)

    def store_content_type(self, match):
        """Callback remembering the ``type/subtype`` of a Content-Type header.

        Emits the leading whitespace, the type, the ``/`` and the subtype as
        separate tokens.
        """
        self.content_type = match.group(1)

        prefix_len = match.start(1) - match.start(0)
        yield match.start(0), Text.Whitespace, match.group(0)[:prefix_len]
        yield match.start(1), Name.Label, match.group(2)
        yield match.end(2), String.Delimiter, '/'
        yield match.start(3), Name.Label, match.group(3)

    def get_content_type_subtokens(self, match):
        """Callback for one ``;name=value`` parameter of a Content-Type header.

        Besides emitting the tokens, a ``boundary`` parameter (name compared
        case-insensitively) is stored in ``self.boundary`` with one pair of
        surrounding double quotes removed.
        """
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Text.Whitespace, match.group(2)
        yield match.start(3), Name.Attribute, match.group(3)
        yield match.start(4), Operator, match.group(4)
        yield match.start(5), String, match.group(5)

        if match.group(3).lower() == "boundary":
            boundary = match.group(5).strip()
            # The length check keeps an empty value ("boundary=") from
            # raising IndexError and a lone quote from being sliced away.
            if len(boundary) >= 2 and boundary[0] == '"' and boundary[-1] == '"':
                boundary = boundary[1:-1]
            self.boundary = boundary

    def store_content_transfer_encoding(self, match):
        """Callback remembering the (lower-cased) Content-Transfer-Encoding."""
        self.content_transfer_encoding = match.group(0).lower()
        yield match.start(0), Name.Constant, match.group(0)

    # headers whose value is tokenized in detail; there must be a state of
    # the same name in ``tokens`` for each entry
    attention_headers = {"content-type", "content-transfer-encoding"}

    tokens = {
        "root": [
            (r"^([\w-]+):( *)([\s\S]*?\n)(?![ \t])", get_header_tokens),
            (r"^$[\s\S]+", get_body_tokens),
        ],
        "header": [
            # folding
            (r"\n[ \t]", Text.Whitespace),
            (r"\n(?![ \t])", Text.Whitespace, "#pop"),
        ],
        "content-type": [
            include("header"),
            (
                r"^\s*((multipart|application|audio|font|image|model|text|video"
                r"|message)/([\w-]+))",
                store_content_type,
            ),
            (r'(;)((?:[ \t]|\n[ \t])*)([\w:-]+)(=)([\s\S]*?)(?=;|\n(?![ \t]))',
             get_content_type_subtokens),
            (r';[ \t]*\n(?![ \t])', Text, '#pop'),
        ],
        "content-transfer-encoding": [
            include("header"),
            (r"([\w-]+)", store_content_transfer_encoding),
        ],
    }
|
Loading…
Add table
Add a link
Reference in a new issue