Fixed database typo and removed unnecessary class identifier.
This commit is contained in:
		
							parent
							
								
									00ad49a143
								
							
						
					
					
						commit
						45fb349a7d
					
				
					 5098 changed files with 952558 additions and 85 deletions
				
			
		
							
								
								
									
										905
									
								
								venv/Lib/site-packages/scipy/io/arff/arffread.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										905
									
								
								venv/Lib/site-packages/scipy/io/arff/arffread.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,905 @@ | |||
| # Last Change: Mon Aug 20 08:00 PM 2007 J | ||||
| import re | ||||
| import datetime | ||||
| from collections import OrderedDict | ||||
| 
 | ||||
| import numpy as np | ||||
| 
 | ||||
| import csv | ||||
| import ctypes | ||||
| 
 | ||||
| """A module to read arff files.""" | ||||
| 
 | ||||
| __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError'] | ||||
| 
 | ||||
| # An Arff file is basically two parts: | ||||
| #   - header | ||||
| #   - data | ||||
| # | ||||
| # A header has each of its components starting with @META, where META is one | ||||
| # of the keywords (attribute or relation, for now). | ||||
| 
 | ||||
| # TODO: | ||||
| #   - both integer and reals are treated as numeric -> the integer info | ||||
| #    is lost! | ||||
| #   - Replace ValueError by ParseError or something | ||||
| 
 | ||||
| # We can now handle the following: | ||||
| #   - numeric and nominal attributes | ||||
| #   - missing values for numeric attributes | ||||
| 
 | ||||
# Match any line whose first non-blank character is '@'
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line (one or more whitespace characters and nothing else)
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Match the @data marker (any letter case, must start in the first column)
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
# Match the @relation line; group 1 is the relation name
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
# Match an @attribute line; group 1 is everything after the keyword
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# Match a nominal type definition; group 1 is the text between the braces
r_nominal = re.compile('{(.+)}')
# Match a date type definition; group 1 is the (optionally quoted) format
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")

# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
| 
 | ||||
| # ------------------------ | ||||
| # Module defined exception | ||||
| # ------------------------ | ||||
| 
 | ||||
| 
 | ||||
class ArffError(IOError):
    """Module-defined I/O error; base class of `ParseArffError`."""
| 
 | ||||
| 
 | ||||
class ParseArffError(ArffError):
    """Raised when the content of an ARFF file cannot be parsed."""
| 
 | ||||
| 
 | ||||
| # ---------- | ||||
| # Attributes | ||||
| # ---------- | ||||
class Attribute(object):
    """Base class for the ARFF attribute types handled by this module.

    Subclasses set ``type_name`` and override `parse_attribute` and
    `parse_data`; the implementations here recognise nothing and always
    return None.
    """

    type_name = None

    def __init__(self, name):
        self.name = name
        self.range = None
        self.dtype = np.object_

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.
        """
        return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.
        """
        return None

    def __str__(self):
        """
        Return the attribute formatted as '<name>,<type_name>'.
        """
        return self.name + ',' + self.type_name
| 
 | ||||
| 
 | ||||
class NominalAttribute(Attribute):
    """ARFF nominal attribute: a value taken from a fixed set of strings."""

    type_name = 'nominal'

    def __init__(self, name, values):
        super().__init__(name)
        self.values = values
        self.range = values
        # np.bytes_ is the same type as the np.string_ alias, which was
        # removed in NumPy 2.0. The item size is the longest declared value.
        self.dtype = (np.bytes_, max(len(i) for i in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
           Nominal type definition

        Returns
        -------
        poss_vals : tuple
           possible values

        Raises
        ------
        ValueError
            If `atrv` does not contain a brace-delimited list.

        Examples
        --------
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        m = r_nominal.match(atrv)
        if m:
            attrs, _ = split_data_line(m.group(1))
            return tuple(attrs)
        else:
            raise ValueError("This does not look like a nominal string")

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For nominal attributes, the attribute string would be like '{<attr_1>,
         <attr2>, <attr_3>}'.
        """
        if attr_string[0] == '{':
            values = cls._get_nom_val(attr_string)
            return cls(name, values)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        The value is returned unchanged when it is one of the declared
        values or the missing-value marker '?'; anything else raises
        ValueError.
        """
        if data_str in self.values or data_str == '?':
            return data_str
        raise ValueError("%s value not in %s" % (str(data_str),
                                                 str(self.values)))

    def __str__(self):
        """Return the attribute formatted as '<name>,{<v1>,<v2>,...}'."""
        return self.name + ",{" + ",".join(self.values) + "}"
| 
 | ||||
| 
 | ||||
class NumericAttribute(Attribute):
    """ARFF numeric attribute (numeric, int* or real), stored as float."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        # np.float64 is the same type as the np.float_ alias, which was
        # removed in NumPy 2.0.
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For numeric attributes, the attribute string would be like
        'numeric' or 'int' or 'real'.
        """

        attr_string = attr_string.lower().strip()

        # Prefix match, so e.g. 'integer' is accepted through 'int'.
        if attr_string.startswith(('numeric', 'int', 'real')):
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Parameters
        ----------
        data_str : str
           string to convert

        Returns
        -------
        f : float
           where float can be nan

        Examples
        --------
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('1\\n')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        # Any '?' in the field marks a missing value.
        if '?' in data_str:
            return np.nan
        else:
            return float(data_str)

    def _basic_stats(self, data):
        """Return (nanmin, nanmax, mean, std * n / (n - 1)) of `data`."""
        nbfac = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * nbfac)
| 
 | ||||
| 
 | ||||
class StringAttribute(Attribute):
    """ARFF string attribute; recognised here but given no `parse_data`."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Return a `StringAttribute` when the declaration starts with
        'string' (any letter case), otherwise None.
        """
        normalized = attr_string.lower().strip()
        return cls(name) if normalized.startswith('string') else None
| 
 | ||||
| 
 | ||||
class DateAttribute(Attribute):
    """ARFF date attribute, parsed into ``numpy.datetime64`` values."""

    def __init__(self, name, date_format, datetime_unit):
        super().__init__(name)
        self.date_format = date_format
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Translate an ARFF date declaration into a strptime pattern.

        Parameters
        ----------
        atrv : str
            The type part of the declaration, e.g. ``date "yyyy-MM-dd"``.

        Returns
        -------
        pattern : str
            The format with the Java SimpleDateFormat fields replaced by
            their C/strptime equivalents.
        datetime_unit : str
            datetime64 unit code of the finest field found in the format.

        Raises
        ------
        ValueError
            If no format is found, the format contains a time zone field,
            or none of the supported fields is present.
        """
        m = r_date.match(atrv)
        if not m:
            raise ValueError("Invalid or no date format")

        pattern = m.group(1).strip()
        # convert time pattern from Java's SimpleDateFormat to C's format
        datetime_unit = None
        if "yyyy" in pattern:
            pattern = pattern.replace("yyyy", "%Y")
            datetime_unit = "Y"
        elif "yy" in pattern:
            # The original tested the bare literal "yy", which is always
            # true and made the "unsupported format" check unreachable.
            pattern = pattern.replace("yy", "%y")
            datetime_unit = "Y"

        # Ordered from coarsest to finest so the last hit sets the unit.
        fields = (("MM", "%m", "M"),
                  ("dd", "%d", "D"),
                  ("HH", "%H", "h"),
                  ("mm", "%M", "m"),
                  ("ss", "%S", "s"))
        for java_field, c_field, unit in fields:
            if java_field in pattern:
                pattern = pattern.replace(java_field, c_field)
                datetime_unit = unit

        if "z" in pattern or "Z" in pattern:
            raise ValueError("Date type attributes with time zone not "
                             "supported, yet")

        if datetime_unit is None:
            raise ValueError("Invalid or unsupported date format")

        return pattern, datetime_unit

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For date attributes, the attribute string would be like
        'date <format>'.
        """

        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('date')] == 'date':
            # The original-case string is passed on: the format fields are
            # case sensitive (MM is month, mm is minute).
            date_format, datetime_unit = cls._get_date_format(attr_string)
            return cls(name, date_format, datetime_unit)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Surrounding whitespace and quotes are removed first; '?' yields
        NaT in this attribute's datetime unit.
        """
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            return np.datetime64('NaT', self.datetime_unit)
        else:
            dt = datetime.datetime.strptime(date_str, self.date_format)
            return np.datetime64(dt).astype(
                "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        """Return '<name>,<type_name>,<date_format>'."""
        return super().__str__() + ',' + self.date_format
| 
 | ||||
| 
 | ||||
class RelationalAttribute(Attribute):
    """ARFF relational attribute: each value is itself a small table."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'relational'
        self.dtype = np.object_
        # Nested attributes; filled in by the header parser.
        self.attributes = []
        # CSV dialect, sniffed on the first parsed row and then reused.
        self.dialect = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For relational attributes, the attribute string would be like
        'relational'.
        """

        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('relational')] == 'relational':
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        The field is unescaped, split into rows on newlines, and every row
        is parsed with the nested attributes. Returns a structured array
        with one field per nested attribute.
        """
        elems = list(range(len(self.attributes)))

        # Turn escape sequences such as a literal backslash-n into the
        # characters they stand for.
        escaped_string = data_str.encode().decode("unicode-escape")

        row_tuples = []

        for raw in escaped_string.split("\n"):
            row, self.dialect = split_data_line(raw, self.dialect)

            row_tuples.append(tuple(
                [self.attributes[i].parse_data(row[i]) for i in elems]))

        return np.array(row_tuples,
                        [(a.name, a.dtype) for a in self.attributes])

    def __str__(self):
        """Return the base description followed by one line per nested
        attribute, each indented by a tab."""
        return (super().__str__() + '\n\t' +
                '\n\t'.join(str(a) for a in self.attributes))
| 
 | ||||
| 
 | ||||
| # ----------------- | ||||
| # Various utilities | ||||
| # ----------------- | ||||
def to_attribute(name, attr_string):
    """Build the attribute object matching a type declaration.

    Each attribute class is asked in turn to parse `attr_string`; the first
    one that returns something other than None wins. Raises ParseArffError
    when no class recognises the declaration.
    """
    candidates = (NominalAttribute, NumericAttribute, DateAttribute,
                  StringAttribute, RelationalAttribute)

    for candidate in candidates:
        parsed = candidate.parse_attribute(name, attr_string)
        if parsed is None:
            continue
        return parsed

    raise ParseArffError("unknown attribute %s" % attr_string)
| 
 | ||||
| 
 | ||||
def csv_sniffer_has_bug_last_field():
    """
    Checks if the bug https://bugs.python.org/issue30157 is unpatched.

    The answer is computed once and cached on the function object as the
    ``has_bug`` attribute.
    """
    this = csv_sniffer_has_bug_last_field

    cached = getattr(this, "has_bug", None)
    if cached is not None:
        return cached

    # An affected sniffer does not report the quote of the last field.
    sniffed = csv.Sniffer().sniff("3, 'a'")
    this.has_bug = sniffed.quotechar != "'"
    return this.has_bug
| 
 | ||||
| 
 | ||||
def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    When the running csv module has the bug and `sniff_line` ends with a
    quoted field, the quote character, delimiter, doublequote and
    skipinitialspace settings are recomputed here and written onto
    `dialect` in place. Otherwise `dialect` is left untouched.

    Parameters
    ----------
    sniff_line : str
        The line that was handed to ``csv.Sniffer().sniff``.
    dialect : csv dialect
        Dialect returned by the sniffer; modified in place.
    delimiters : str
        Allowed delimiter characters; the detected delimiter is only
        applied when it is one of them.
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'

        # Try the patterns in order and stop at the first that matches;
        # restr, regexp and matches keep their last values after the loop.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',  # .*?",
                      right_regex,  # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged, then this bug does not apply
        # (this also covers the case where no pattern matched at all).
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall returns tuples indexed from 0, group numbers start at 1.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        # Detect a quote character doubled inside a quoted field.
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
            {'delim': re.escape(delim), 'quote': quote}, re.MULTILINE
        )

        doublequote = bool(dq_regexp.search(sniff_line))

        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space
| 
 | ||||
| 
 | ||||
def split_data_line(line, dialect=None):
    """Split one line of ARFF data into its fields.

    Parameters
    ----------
    line : str
        The raw line; a single trailing newline is removed. An empty
        string is accepted and yields an empty row.
    dialect : csv dialect, optional
        Dialect to parse with. When None, it is sniffed from `line`.

    Returns
    -------
    row : list of str
        The fields of the line.
    dialect : csv dialect
        The dialect that was used, to be passed back in for later lines.
    """
    delimiters = ",\t"

    # This can not be done in a per reader basis, and relational fields
    # can be HUGE
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

    # Remove the line end if any. endswith() is used instead of line[-1]
    # so that an empty string does not raise IndexError.
    if line.endswith('\n'):
        line = line[:-1]

    sniff_line = line

    # Add a delimiter if none is present, so that the csv.Sniffer
    # does not complain for a single-field CSV.
    if not any(d in line for d in delimiters):
        sniff_line += ","

    if dialect is None:
        dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
        workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                              dialect=dialect,
                                              delimiters=delimiters)

    row = next(csv.reader([line], dialect))

    return row, dialect
| 
 | ||||
| 
 | ||||
| # -------------- | ||||
| # Parsing header | ||||
| # -------------- | ||||
def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * The name and the type must be on that same line; declarations
      spread over several lines are rejected.

    Parameters
    ----------
    iterable : iterator of str
       source of the following header lines; it is advanced by one line,
       and further when the attribute is relational.
    attribute : str
       the attribute string.

    Returns
    -------
    attribute : Attribute
       the parsed attribute object
    next_item : str
       next line to be parsed

    Raises
    ------
    ValueError
       If the line is not an @attribute line or cannot be tokenized.

    Examples
    --------
    If attribute is a string defined in python as r"floupi real", the
    returned attribute is a numeric attribute named floupi.

    >>> iterable = iter([0] * 10) # dummy iterator
    >>> attr, nxt = tokenize_attribute(iterable, r"@attribute floupi real")
    >>> attr.name, attr.type_name, nxt
    ('floupi', 'numeric', 0)

    If attribute is r"'floupi 2' real", the name is 'floupi 2'.

    >>> attr, nxt = tokenize_attribute(iterable,
    ...                                r"  @attribute 'floupi 2' real   ")
    >>> attr.name, attr.type_name, nxt
    ('floupi 2', 'numeric', 0)

    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if not mattr:
        raise ValueError("First line unparsable: %s" % sattr)

    # atrv is everything after @attribute
    atrv = mattr.group(1)
    if r_comattrval.match(atrv):
        name, attr_type = tokenize_single_comma(atrv)
    elif r_wcomattrval.match(atrv):
        name, attr_type = tokenize_single_wcomma(atrv)
    else:
        # Not sure we should support this, as it does not seem supported by
        # weka.
        raise ValueError("multi line not supported yet")
    next_item = next(iterable)

    parsed = to_attribute(name, attr_type)

    if attr_type.lower() == 'relational':
        # The nested attribute declarations follow, up to the @end line.
        next_item = read_relational_attribute(iterable, parsed, next_item)

    return parsed, next_item
| 
 | ||||
| 
 | ||||
def tokenize_single_comma(val):
    """Split "'<name>' <type>" (name in single quotes) into (name, type).

    Raises ValueError when `val` does not have that shape.
    """
    # XXX the caller has already matched the same pattern once; matching
    # again here is redundant but keeps this helper self-contained.
    matched = r_comattrval.match(val)
    if not matched:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        attr_name = matched.group(1).strip()
        attr_type = matched.group(2).strip()
    except IndexError:
        raise ValueError("Error while tokenizing attribute")
    return attr_name, attr_type
| 
 | ||||
| 
 | ||||
def tokenize_single_wcomma(val):
    """Split "<name> <type>" (unquoted name) into (name, type).

    Raises ValueError when `val` does not have that shape.
    """
    # XXX the caller has already matched the same pattern once; matching
    # again here is redundant but keeps this helper self-contained.
    matched = r_wcomattrval.match(val)
    if not matched:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        attr_name = matched.group(1).strip()
        attr_type = matched.group(2).strip()
    except IndexError:
        raise ValueError("Error while tokenizing attribute")
    return attr_name, attr_type
| 
 | ||||
| 
 | ||||
def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute.

    Parameters
    ----------
    ofile : iterator of str
        Source of the header lines.
    relational_attribute : RelationalAttribute
        Attribute being read; every nested attribute found is appended to
        its ``attributes`` list.
    i : str
        The current line, i.e. the one following the relational
        declaration.

    Returns
    -------
    str
        The line following the matching ``@end <name>`` line.

    Raises
    ------
    ValueError
        If a header line other than @attribute appears before the end.
    """
    # The name is escaped so that regex metacharacters in it ('+', '(', '.'
    # ...) are matched literally instead of being read as pattern syntax.
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  re.escape(relational_attribute.name) +
                                  r'\s*$')

    while not r_end_relational.match(i):
        if r_headerline.match(i):
            if r_attribute.match(i):
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            # Not a header line: skip it.
            i = next(ofile)

    return next(ofile)
| 
 | ||||
| 
 | ||||
def read_header(ofile):
    """Read the header of the iterable ofile.

    Lines are consumed up to the @data marker. Returns a tuple
    ``(relation, attributes)``: the relation name (None if no @relation
    line was seen) and the list of parsed attributes in file order.
    Raises ValueError on a header line that is neither @attribute nor
    @relation.
    """
    line = next(ofile)

    # Skip the leading comments
    while r_comment.match(line):
        line = next(ofile)

    relation = None
    attributes = []
    while not r_datameta.match(line):
        if not r_headerline.match(line):
            # Not a header line: skip it.
            line = next(ofile)
            continue

        if r_attribute.match(line):
            # tokenize_attribute also hands back the next line to examine.
            attr, line = tokenize_attribute(ofile, line)
            attributes.append(attr)
            continue

        rel_match = r_relation.match(line)
        if rel_match is None:
            raise ValueError("Error parsing line %s" % line)
        relation = rel_match.group(1)
        line = next(ofile)

    return relation, attributes
| 
 | ||||
| 
 | ||||
class MetaData(object):
    """Small container to keep useful information on a ARFF dataset.

    Holds the relation name and the attributes, keyed by attribute name.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # Iterating yields the attribute names of the iris.arff dataset
        for i in meta:
            print(i)
        # The same names, as a list
        meta.names()
        # The attribute types
        types = meta.types()

    Methods
    -------
    names
    types

    Notes
    -----
    The attributes are kept in definition order, so iterating over an
    instance, ``names()`` and ``types()`` all follow the order in which
    the attributes were given.
    """
    def __init__(self, rel, attr):
        self.name = rel

        # Ordered mapping: attribute name -> attribute object
        self._attributes = OrderedDict((a.name, a) for a in attr)

    def __repr__(self):
        pieces = ["Dataset: %s\n" % self.name]
        for attr_name, attribute in self._attributes.items():
            line = "\t%s's type is %s" % (attr_name, attribute.type_name)
            if attribute.range:
                line += ", range is %s" % str(attribute.range)
            pieces.append(line + '\n')
        return "".join(pieces)

    def __iter__(self):
        return iter(self._attributes)

    def __getitem__(self, key):
        found = self._attributes[key]
        return (found.type_name, found.range)

    def names(self):
        """Return the list of attribute names.

        Returns
        -------
        attrnames : list of str
            The attribute names.
        """
        return list(self._attributes.keys())

    def types(self):
        """Return the list of attribute types.

        Returns
        -------
        attr_types : list of str
            The attribute types.
        """
        return [attribute.type_name
                for attribute in self._attributes.values()]
| 
 | ||||
| 
 | ||||
def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
       File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
       The data of the arff file, accessible by attribute names.
    meta : `MetaData`
       Contains information about the arff file such as name and
       type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----

    This function should be able to read most arff files. Not
    implemented functionality include:

    * string type attributes

    It can read files with numeric, nominal, date and relational
    attributes. It cannot read files with sparse data ({} in the file).
    However, this function can read files with missing data (? in the
    file), representing the data points as NaNs for numeric attributes.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    # Anything with a read() method is treated as an already open file.
    if hasattr(f, 'read'):
        ofile = f
    else:
        ofile = open(f, 'rt')
    try:
        return _loadarff(ofile)
    finally:
        if ofile is not f:  # only close what we opened
            ofile.close()
| 
 | ||||
| 
 | ||||
def _loadarff(ofile):
    """Parse an open ARFF file; see `loadarff` for the public contract.

    Parameters
    ----------
    ofile : iterator of str
        The lines of the file, header first.

    Returns
    -------
    data : numpy structured array
        One record per data line, one field per attribute.
    meta : MetaData
        Relation name and attribute descriptions.

    Raises
    ------
    ParseArffError
        If the header cannot be parsed.
    NotImplementedError
        If the header declares a string attribute.
    """
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        # Chain the original error so its traceback is not lost.
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = any(isinstance(a, StringAttribute) for a in attr)

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter):
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        #   - We could for example "compile" the function, because some values
        #   do not change here.
        #   - The function to convert a line to dtyped values could also be
        #   generated on the fly from a string and be executed instead of
        #   looping.
        #   - The regex are overkill: for comments, checking that a line starts
        #   by % should be enough and faster, and for empty lines, same thing
        #   --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # Sniffed on the first data line, then reused for the others.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue

            row, dialect = split_data_line(raw, dialect)

            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    rows = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    data = np.array(rows, [(a.name, a.dtype) for a in attr])
    return data, meta
| 
 | ||||
| 
 | ||||
| # ---- | ||||
| # Misc | ||||
| # ---- | ||||
def basic_stats(data):
    """Return (nanmin, nanmax, mean, std * n / (n - 1)) of the array `data`,
    where n is ``data.size``."""
    count = data.size
    nbfac = count * 1. / (count - 1)
    lowest = np.nanmin(data)
    highest = np.nanmax(data)
    average = np.mean(data)
    spread = np.std(data) * nbfac
    return lowest, highest, average, spread
| 
 | ||||
| 
 | ||||
def print_attribute(name, tp, data):
    """Print a one-line description of an attribute.

    For a numeric type the line is
    '<name>,<type>,<min>,<max>,<mean>,<std>' computed from `data`; for any
    other type it is ``str(tp)``.
    """
    kind = tp.type_name
    if kind in ('numeric', 'real', 'integer'):
        lowest, highest, average, spread = basic_stats(data)
        print("%s,%s,%f,%f,%f,%f" % (name, kind, lowest, highest,
                                     average, spread))
        return
    print(str(tp))
| 
 | ||||
| 
 | ||||
def test_weka(filename):
    """Load an ARFF file and print a short summary of it.

    Prints the number of fields, the number of records, then one line per
    attribute (see `print_attribute`).
    """
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for i in meta:
        # meta[i] is a (type_name, range) tuple, but print_attribute needs
        # the attribute object itself (it reads .type_name and calls str()).
        print_attribute(i, meta._attributes[i], data[i])
| 
 | ||||
| 
 | ||||
| # make sure nose does not find this as a test | ||||
| test_weka.__test__ = False | ||||
| 
 | ||||
| 
 | ||||
if __name__ == '__main__':
    # Command-line use: summarise the ARFF file given as first argument.
    import sys
    test_weka(sys.argv[1])
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue