from collections import namedtuple
import re
import sys
from ast import literal_eval
from functools import total_ordering

from parso._compatibility import unicode

# The following is a list of characters that str.splitlines treats as line
# breaks, but that are not line breaks in Python source code. In Python only
# \r (Carriage Return, 0xD) and \n (Line Feed, 0xA) are allowed to split lines.
_NON_LINE_BREAKS = (
    u'\v',  # Vertical Tabulation 0xB
    u'\f',  # Form Feed 0xC
    u'\x1C',  # File Separator
    u'\x1D',  # Group Separator
    u'\x1E',  # Record Separator
    u'\x85',  # Next Line (NEL - Equivalent to CR+LF.
              # Used to mark end-of-line on some IBM mainframes.)
    u'\u2028',  # Line Separator
    u'\u2029',  # Paragraph Separator
)

Version = namedtuple('Version', 'major, minor, micro')


def split_lines(string, keepends=False):
    r"""
    Intended for Python code. In contrast to Python's :py:meth:`str.splitlines`,
    looks at form feeds and other special characters as normal text. Just
    splits on ``\n``, ``\r\n`` and ``\r``.
    Also different: Returns ``[""]`` for an empty string input.

    In Python 2.7 form feeds are used as normal characters when using
    str.splitlines. However in Python 3 somewhere there was a decision to split
    also on form feeds.

    :param string: The text to split.
    :param keepends: If true, every returned line keeps its line terminator.
    :return: A list of lines. It always contains at least one element and
        ends with ``""`` when the input ends with a line break.
    """
    if not keepends:
        return re.split(r'\n|\r\n|\r', string)

    lst = string.splitlines(True)

    # We have to merge lines that were broken at characters that are not line
    # breaks in Python source. Walking backwards keeps the indices of the
    # lines that still have to be visited stable while merging. The last
    # line has no successor to merge with, so the walk starts one before it.
    for index in range(len(lst) - 2, -1, -1):
        # Lines returned by splitlines(True) are never empty.
        if lst[index][-1] in _NON_LINE_BREAKS:
            lst[index] += lst.pop(index + 1)

    # The stdlib's implementation of the end is inconsistent when calling
    # it with/without keepends. One time there's an empty string in the
    # end, one time there's none.
    if string.endswith(('\n', '\r')) or string == '':
        lst.append('')
    return lst


def python_bytes_to_unicode(source, encoding='utf-8', errors='strict'):
    """
    Checks for unicode BOMs and PEP 263 encoding declarations. Then returns a
    unicode object like in :py:meth:`bytes.decode`.

    :param source: The source code, as bytes or unicode. Unicode input is
        returned unchanged.
    :param encoding: See :py:meth:`bytes.decode` documentation. Only used
        when the source declares no encoding itself.
    :param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be
        ``'strict'``, ``'replace'`` or ``'ignore'``.
    :raises LookupError: If the detected encoding is unknown and ``errors``
        is not ``'replace'``.
    """
    def detect_encoding():
        """
        For the implementation of encoding definitions in Python, look at:
        - http://www.python.org/dev/peps/pep-0263/
        - http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
        """
        byte_mark = literal_eval(r"b'\xef\xbb\xbf'")
        if source.startswith(byte_mark):
            # UTF-8 byte-order mark
            return 'utf-8'

        # A declaration is only looked for in the first two lines. All three
        # line terminators are accepted here; matching ``\n`` alone would miss
        # the declaration in sources that use ``\r`` as line ending.
        first_two_lines = re.match(
            br'(?:[^\r\n]*(?:\r\n|\r|\n)){0,2}', source
        ).group(0)
        possible_encoding = re.search(br"coding[=:]\s*([-\w.]+)",
                                      first_two_lines)
        if possible_encoding:
            return possible_encoding.group(1)
        else:
            # the default if nothing else has been set -> PEP 263
            return encoding

    if isinstance(source, unicode):
        # only cast str/bytes
        return source

    encoding = detect_encoding()
    if not isinstance(encoding, unicode):
        # A declared encoding is captured as bytes by the regex above.
        encoding = unicode(encoding, 'utf-8', 'replace')

    try:
        # Cast to unicode
        return unicode(source, encoding, errors)
    except LookupError:
        if errors == 'replace':
            # This is a weird case that can happen if the given encoding is not
            # a valid encoding. This usually shouldn't happen with provided
            # encodings, but can happen if somebody uses encoding declarations
            # like `# coding: foo-8`.
            return unicode(source, 'utf-8', errors)
        raise


def version_info():
    """
    Returns a namedtuple of parso's version, similar to Python's
    ``sys.version_info``.

    :return: A ``Version(major, minor, micro)`` tuple of integers.
    """
    from parso import __version__
    tupl = re.findall(r'[a-z]+|\d+', __version__)
    # ``Version`` only has three fields. Components after the micro version
    # (e.g. a pre-release suffix such as ``rc1``) are dropped; passing them on
    # would make the namedtuple constructor raise a TypeError.
    return Version(*[int(x) for x in tupl[:3]])


def _parse_version(version):
    """
    Converts a version string into a :class:`PythonVersionInfo`.

    :param version: A string like ``"3"``, ``"3.8"`` or ``"2.7.1"``.
    :raises ValueError: If the string does not match the expected format.
    :raises NotImplementedError: If only a major version other than 2 or 3 is
        given.
    """
    match = re.match(r'(\d+)(?:\.(\d{1,2})(?:\.\d+)?)?((a|b|rc)\d)?$', version)
    if match is None:
        raise ValueError('The given version is not in the right format. '
                         'Use something like "3.8" or "3".')

    major = int(match.group(1))
    minor = match.group(2)
    if minor is None:
        # Use the latest Python in case it's not exactly defined, because the
        # grammars are typically backwards compatible?
        if major == 2:
            minor = "7"
        elif major == 3:
            minor = "6"
        else:
            raise NotImplementedError("Sorry, no support yet for those fancy new/old versions.")
    minor = int(minor)
    return PythonVersionInfo(major, minor)


@total_ordering
class PythonVersionInfo(namedtuple('Version', 'major, minor')):
    """
    A ``(major, minor)`` Python version that refuses comparison with tuples
    of any other length in ``>``, ``==`` and ``!=``.
    """
    # NOTE(review): tuple already defines the other ordering methods, so
    # total_ordering appears not to replace them; ``<``, ``<=`` and ``>=``
    # would then skip the length check below. Confirm whether that is intended.

    def __gt__(self, other):
        if isinstance(other, tuple):
            if len(other) != 2:
                raise ValueError("Can only compare to tuples of length 2.")
            return (self.major, self.minor) > other
        # Hand the decision back to Python for non-tuple operands instead of
        # answering with a truthy tuple.
        return super(PythonVersionInfo, self).__gt__(other)

    def __eq__(self, other):
        if isinstance(other, tuple):
            if len(other) != 2:
                raise ValueError("Can only compare to tuples of length 2.")
            return (self.major, self.minor) == other
        return super(PythonVersionInfo, self).__eq__(other)

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            # Negating NotImplemented would hide it from Python's fallback.
            return result
        return not result

    def __hash__(self):
        # Defining __eq__ removes the inherited hash on Python 3; restore one
        # that agrees with equality to plain ``(major, minor)`` tuples.
        return hash((self.major, self.minor))


def parse_version_string(version=None):
    """
    Checks for a valid version number (e.g. `3.8` or `2.7.1` or `3`) and
    returns a corresponding :class:`PythonVersionInfo` that only consists of
    the major and the minor version.

    :param version: The version string. Defaults to the version of the
        running interpreter.
    :raises TypeError: If ``version`` is not a string.
    """
    if version is None:
        version = '%s.%s' % sys.version_info[:2]
    if not isinstance(version, (unicode, str)):
        raise TypeError('version must be a string like "3.8"')

    return _parse_version(version)