Fixed database typo and removed unnecessary class identifier.
This commit is contained in:
parent
00ad49a143
commit
45fb349a7d
5098 changed files with 952558 additions and 85 deletions
905
venv/Lib/site-packages/scipy/io/arff/arffread.py
Normal file
905
venv/Lib/site-packages/scipy/io/arff/arffread.py
Normal file
|
@ -0,0 +1,905 @@
|
|||
# Last Change: Mon Aug 20 08:00 PM 2007 J
|
||||
import re
|
||||
import datetime
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
|
||||
import csv
|
||||
import ctypes
|
||||
|
||||
"""A module to read arff files."""
|
||||
|
||||
__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
|
||||
|
||||
# An Arff file is basically two parts:
|
||||
# - header
|
||||
# - data
|
||||
#
|
||||
# A header has each of its components starting by @META where META is one of
|
||||
# the keyword (attribute of relation, for now).
|
||||
|
||||
# TODO:
|
||||
# - both integer and reals are treated as numeric -> the integer info
|
||||
# is lost!
|
||||
# - Replace ValueError by ParseError or something
|
||||
|
||||
# We know can handle the following:
|
||||
# - numeric and nominal attributes
|
||||
# - missing values for numeric attributes
|
||||
|
||||
r_meta = re.compile(r'^\s*@')
|
||||
# Match a comment
|
||||
r_comment = re.compile(r'^%')
|
||||
# Match an empty line
|
||||
r_empty = re.compile(r'^\s+$')
|
||||
# Match a header line, that is a line which starts by @ + a word
|
||||
r_headerline = re.compile(r'^\s*@\S*')
|
||||
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
|
||||
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
|
||||
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
|
||||
|
||||
r_nominal = re.compile('{(.+)}')
|
||||
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
|
||||
|
||||
# To get attributes name enclosed with ''
|
||||
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
|
||||
# To get normal attributes
|
||||
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
|
||||
|
||||
# ------------------------
|
||||
# Module defined exception
|
||||
# ------------------------
|
||||
|
||||
|
||||
class ArffError(IOError):
|
||||
pass
|
||||
|
||||
|
||||
class ParseArffError(ArffError):
|
||||
pass
|
||||
|
||||
|
||||
# ----------
|
||||
# Attributes
|
||||
# ----------
|
||||
class Attribute(object):
|
||||
|
||||
type_name = None
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.range = None
|
||||
self.dtype = np.object_
|
||||
|
||||
@classmethod
|
||||
def parse_attribute(cls, name, attr_string):
|
||||
"""
|
||||
Parse the attribute line if it knows how. Returns the parsed
|
||||
attribute, or None.
|
||||
"""
|
||||
return None
|
||||
|
||||
def parse_data(self, data_str):
|
||||
"""
|
||||
Parse a value of this type.
|
||||
"""
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
Parse a value of this type.
|
||||
"""
|
||||
return self.name + ',' + self.type_name
|
||||
|
||||
|
||||
class NominalAttribute(Attribute):
|
||||
|
||||
type_name = 'nominal'
|
||||
|
||||
def __init__(self, name, values):
|
||||
super().__init__(name)
|
||||
self.values = values
|
||||
self.range = values
|
||||
self.dtype = (np.string_, max(len(i) for i in values))
|
||||
|
||||
@staticmethod
|
||||
def _get_nom_val(atrv):
|
||||
"""Given a string containing a nominal type, returns a tuple of the
|
||||
possible values.
|
||||
|
||||
A nominal type is defined as something framed between braces ({}).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
atrv : str
|
||||
Nominal type definition
|
||||
|
||||
Returns
|
||||
-------
|
||||
poss_vals : tuple
|
||||
possible values
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> get_nom_val("{floup, bouga, fl, ratata}")
|
||||
('floup', 'bouga', 'fl', 'ratata')
|
||||
"""
|
||||
m = r_nominal.match(atrv)
|
||||
if m:
|
||||
attrs, _ = split_data_line(m.group(1))
|
||||
return tuple(attrs)
|
||||
else:
|
||||
raise ValueError("This does not look like a nominal string")
|
||||
|
||||
@classmethod
|
||||
def parse_attribute(cls, name, attr_string):
|
||||
"""
|
||||
Parse the attribute line if it knows how. Returns the parsed
|
||||
attribute, or None.
|
||||
|
||||
For nominal attributes, the attribute string would be like '{<attr_1>,
|
||||
<attr2>, <attr_3>}'.
|
||||
"""
|
||||
if attr_string[0] == '{':
|
||||
values = cls._get_nom_val(attr_string)
|
||||
return cls(name, values)
|
||||
else:
|
||||
return None
|
||||
|
||||
def parse_data(self, data_str):
|
||||
"""
|
||||
Parse a value of this type.
|
||||
"""
|
||||
if data_str in self.values:
|
||||
return data_str
|
||||
elif data_str == '?':
|
||||
return data_str
|
||||
else:
|
||||
raise ValueError("%s value not in %s" % (str(data_str),
|
||||
str(self.values)))
|
||||
|
||||
def __str__(self):
|
||||
msg = self.name + ",{"
|
||||
for i in range(len(self.values)-1):
|
||||
msg += self.values[i] + ","
|
||||
msg += self.values[-1]
|
||||
msg += "}"
|
||||
return msg
|
||||
|
||||
|
||||
class NumericAttribute(Attribute):
|
||||
|
||||
def __init__(self, name):
|
||||
super().__init__(name)
|
||||
self.type_name = 'numeric'
|
||||
self.dtype = np.float_
|
||||
|
||||
@classmethod
|
||||
def parse_attribute(cls, name, attr_string):
|
||||
"""
|
||||
Parse the attribute line if it knows how. Returns the parsed
|
||||
attribute, or None.
|
||||
|
||||
For numeric attributes, the attribute string would be like
|
||||
'numeric' or 'int' or 'real'.
|
||||
"""
|
||||
|
||||
attr_string = attr_string.lower().strip()
|
||||
|
||||
if(attr_string[:len('numeric')] == 'numeric' or
|
||||
attr_string[:len('int')] == 'int' or
|
||||
attr_string[:len('real')] == 'real'):
|
||||
return cls(name)
|
||||
else:
|
||||
return None
|
||||
|
||||
def parse_data(self, data_str):
|
||||
"""
|
||||
Parse a value of this type.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data_str : str
|
||||
string to convert
|
||||
|
||||
Returns
|
||||
-------
|
||||
f : float
|
||||
where float can be nan
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> atr = NumericAttribute('atr')
|
||||
>>> atr.parse_data('1')
|
||||
1.0
|
||||
>>> atr.parse_data('1\\n')
|
||||
1.0
|
||||
>>> atr.parse_data('?\\n')
|
||||
nan
|
||||
"""
|
||||
if '?' in data_str:
|
||||
return np.nan
|
||||
else:
|
||||
return float(data_str)
|
||||
|
||||
def _basic_stats(self, data):
|
||||
nbfac = data.size * 1. / (data.size - 1)
|
||||
return (np.nanmin(data), np.nanmax(data),
|
||||
np.mean(data), np.std(data) * nbfac)
|
||||
|
||||
|
||||
class StringAttribute(Attribute):
|
||||
|
||||
def __init__(self, name):
|
||||
super().__init__(name)
|
||||
self.type_name = 'string'
|
||||
|
||||
@classmethod
|
||||
def parse_attribute(cls, name, attr_string):
|
||||
"""
|
||||
Parse the attribute line if it knows how. Returns the parsed
|
||||
attribute, or None.
|
||||
|
||||
For string attributes, the attribute string would be like
|
||||
'string'.
|
||||
"""
|
||||
|
||||
attr_string = attr_string.lower().strip()
|
||||
|
||||
if attr_string[:len('string')] == 'string':
|
||||
return cls(name)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
class DateAttribute(Attribute):
|
||||
|
||||
def __init__(self, name, date_format, datetime_unit):
|
||||
super().__init__(name)
|
||||
self.date_format = date_format
|
||||
self.datetime_unit = datetime_unit
|
||||
self.type_name = 'date'
|
||||
self.range = date_format
|
||||
self.dtype = np.datetime64(0, self.datetime_unit)
|
||||
|
||||
@staticmethod
|
||||
def _get_date_format(atrv):
|
||||
m = r_date.match(atrv)
|
||||
if m:
|
||||
pattern = m.group(1).strip()
|
||||
# convert time pattern from Java's SimpleDateFormat to C's format
|
||||
datetime_unit = None
|
||||
if "yyyy" in pattern:
|
||||
pattern = pattern.replace("yyyy", "%Y")
|
||||
datetime_unit = "Y"
|
||||
elif "yy":
|
||||
pattern = pattern.replace("yy", "%y")
|
||||
datetime_unit = "Y"
|
||||
if "MM" in pattern:
|
||||
pattern = pattern.replace("MM", "%m")
|
||||
datetime_unit = "M"
|
||||
if "dd" in pattern:
|
||||
pattern = pattern.replace("dd", "%d")
|
||||
datetime_unit = "D"
|
||||
if "HH" in pattern:
|
||||
pattern = pattern.replace("HH", "%H")
|
||||
datetime_unit = "h"
|
||||
if "mm" in pattern:
|
||||
pattern = pattern.replace("mm", "%M")
|
||||
datetime_unit = "m"
|
||||
if "ss" in pattern:
|
||||
pattern = pattern.replace("ss", "%S")
|
||||
datetime_unit = "s"
|
||||
if "z" in pattern or "Z" in pattern:
|
||||
raise ValueError("Date type attributes with time zone not "
|
||||
"supported, yet")
|
||||
|
||||
if datetime_unit is None:
|
||||
raise ValueError("Invalid or unsupported date format")
|
||||
|
||||
return pattern, datetime_unit
|
||||
else:
|
||||
raise ValueError("Invalid or no date format")
|
||||
|
||||
@classmethod
|
||||
def parse_attribute(cls, name, attr_string):
|
||||
"""
|
||||
Parse the attribute line if it knows how. Returns the parsed
|
||||
attribute, or None.
|
||||
|
||||
For date attributes, the attribute string would be like
|
||||
'date <format>'.
|
||||
"""
|
||||
|
||||
attr_string_lower = attr_string.lower().strip()
|
||||
|
||||
if attr_string_lower[:len('date')] == 'date':
|
||||
date_format, datetime_unit = cls._get_date_format(attr_string)
|
||||
return cls(name, date_format, datetime_unit)
|
||||
else:
|
||||
return None
|
||||
|
||||
def parse_data(self, data_str):
|
||||
"""
|
||||
Parse a value of this type.
|
||||
"""
|
||||
date_str = data_str.strip().strip("'").strip('"')
|
||||
if date_str == '?':
|
||||
return np.datetime64('NaT', self.datetime_unit)
|
||||
else:
|
||||
dt = datetime.datetime.strptime(date_str, self.date_format)
|
||||
return np.datetime64(dt).astype(
|
||||
"datetime64[%s]" % self.datetime_unit)
|
||||
|
||||
def __str__(self):
|
||||
return super(DateAttribute, self).__str__() + ',' + self.date_format
|
||||
|
||||
|
||||
class RelationalAttribute(Attribute):
|
||||
|
||||
def __init__(self, name):
|
||||
super().__init__(name)
|
||||
self.type_name = 'relational'
|
||||
self.dtype = np.object_
|
||||
self.attributes = []
|
||||
self.dialect = None
|
||||
|
||||
@classmethod
|
||||
def parse_attribute(cls, name, attr_string):
|
||||
"""
|
||||
Parse the attribute line if it knows how. Returns the parsed
|
||||
attribute, or None.
|
||||
|
||||
For date attributes, the attribute string would be like
|
||||
'date <format>'.
|
||||
"""
|
||||
|
||||
attr_string_lower = attr_string.lower().strip()
|
||||
|
||||
if attr_string_lower[:len('relational')] == 'relational':
|
||||
return cls(name)
|
||||
else:
|
||||
return None
|
||||
|
||||
def parse_data(self, data_str):
|
||||
# Copy-pasted
|
||||
elems = list(range(len(self.attributes)))
|
||||
|
||||
escaped_string = data_str.encode().decode("unicode-escape")
|
||||
|
||||
row_tuples = []
|
||||
|
||||
for raw in escaped_string.split("\n"):
|
||||
row, self.dialect = split_data_line(raw, self.dialect)
|
||||
|
||||
row_tuples.append(tuple(
|
||||
[self.attributes[i].parse_data(row[i]) for i in elems]))
|
||||
|
||||
return np.array(row_tuples,
|
||||
[(a.name, a.dtype) for a in self.attributes])
|
||||
|
||||
def __str__(self):
|
||||
return (super(RelationalAttribute, self).__str__() + '\n\t' +
|
||||
'\n\t'.join(str(a) for a in self.attributes))
|
||||
|
||||
|
||||
# -----------------
|
||||
# Various utilities
|
||||
# -----------------
|
||||
def to_attribute(name, attr_string):
|
||||
attr_classes = (NominalAttribute, NumericAttribute, DateAttribute,
|
||||
StringAttribute, RelationalAttribute)
|
||||
|
||||
for cls in attr_classes:
|
||||
attr = cls.parse_attribute(name, attr_string)
|
||||
if attr is not None:
|
||||
return attr
|
||||
|
||||
raise ParseArffError("unknown attribute %s" % attr_string)
|
||||
|
||||
|
||||
def csv_sniffer_has_bug_last_field():
|
||||
"""
|
||||
Checks if the bug https://bugs.python.org/issue30157 is unpatched.
|
||||
"""
|
||||
|
||||
# We only compute this once.
|
||||
has_bug = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
|
||||
|
||||
if has_bug is None:
|
||||
dialect = csv.Sniffer().sniff("3, 'a'")
|
||||
csv_sniffer_has_bug_last_field.has_bug = dialect.quotechar != "'"
|
||||
has_bug = csv_sniffer_has_bug_last_field.has_bug
|
||||
|
||||
return has_bug
|
||||
|
||||
|
||||
def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
|
||||
"""
|
||||
Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.
|
||||
"""
|
||||
if csv_sniffer_has_bug_last_field():
|
||||
# Reuses code from the csv module
|
||||
right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'
|
||||
|
||||
for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
|
||||
r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # .*?",
|
||||
right_regex, # ,".*?"
|
||||
r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
|
||||
regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
|
||||
matches = regexp.findall(sniff_line)
|
||||
if matches:
|
||||
break
|
||||
|
||||
# If it does not match the expression that was bugged, then this bug does not apply
|
||||
if restr != right_regex:
|
||||
return
|
||||
|
||||
groupindex = regexp.groupindex
|
||||
|
||||
# There is only one end of the string
|
||||
assert len(matches) == 1
|
||||
m = matches[0]
|
||||
|
||||
n = groupindex['quote'] - 1
|
||||
quote = m[n]
|
||||
|
||||
n = groupindex['delim'] - 1
|
||||
delim = m[n]
|
||||
|
||||
n = groupindex['space'] - 1
|
||||
space = bool(m[n])
|
||||
|
||||
dq_regexp = re.compile(
|
||||
r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
|
||||
{'delim': re.escape(delim), 'quote': quote}, re.MULTILINE
|
||||
)
|
||||
|
||||
doublequote = bool(dq_regexp.search(sniff_line))
|
||||
|
||||
dialect.quotechar = quote
|
||||
if delim in delimiters:
|
||||
dialect.delimiter = delim
|
||||
dialect.doublequote = doublequote
|
||||
dialect.skipinitialspace = space
|
||||
|
||||
|
||||
def split_data_line(line, dialect=None):
|
||||
delimiters = ",\t"
|
||||
|
||||
# This can not be done in a per reader basis, and relational fields
|
||||
# can be HUGE
|
||||
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
|
||||
|
||||
# Remove the line end if any
|
||||
if line[-1] == '\n':
|
||||
line = line[:-1]
|
||||
|
||||
sniff_line = line
|
||||
|
||||
# Add a delimiter if none is present, so that the csv.Sniffer
|
||||
# does not complain for a single-field CSV.
|
||||
if not any(d in line for d in delimiters):
|
||||
sniff_line += ","
|
||||
|
||||
if dialect is None:
|
||||
dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
|
||||
workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
|
||||
dialect=dialect,
|
||||
delimiters=delimiters)
|
||||
|
||||
row = next(csv.reader([line], dialect))
|
||||
|
||||
return row, dialect
|
||||
|
||||
|
||||
# --------------
|
||||
# Parsing header
|
||||
# --------------
|
||||
def tokenize_attribute(iterable, attribute):
|
||||
"""Parse a raw string in header (e.g., starts by @attribute).
|
||||
|
||||
Given a raw string attribute, try to get the name and type of the
|
||||
attribute. Constraints:
|
||||
|
||||
* The first line must start with @attribute (case insensitive, and
|
||||
space like characters before @attribute are allowed)
|
||||
* Works also if the attribute is spread on multilines.
|
||||
* Works if empty lines or comments are in between
|
||||
|
||||
Parameters
|
||||
----------
|
||||
attribute : str
|
||||
the attribute string.
|
||||
|
||||
Returns
|
||||
-------
|
||||
name : str
|
||||
name of the attribute
|
||||
value : str
|
||||
value of the attribute
|
||||
next : str
|
||||
next line to be parsed
|
||||
|
||||
Examples
|
||||
--------
|
||||
If attribute is a string defined in python as r"floupi real", will
|
||||
return floupi as name, and real as value.
|
||||
|
||||
>>> iterable = iter([0] * 10) # dummy iterator
|
||||
>>> tokenize_attribute(iterable, r"@attribute floupi real")
|
||||
('floupi', 'real', 0)
|
||||
|
||||
If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
|
||||
and real as value.
|
||||
|
||||
>>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ")
|
||||
('floupi 2', 'real', 0)
|
||||
|
||||
"""
|
||||
sattr = attribute.strip()
|
||||
mattr = r_attribute.match(sattr)
|
||||
if mattr:
|
||||
# atrv is everything after @attribute
|
||||
atrv = mattr.group(1)
|
||||
if r_comattrval.match(atrv):
|
||||
name, type = tokenize_single_comma(atrv)
|
||||
next_item = next(iterable)
|
||||
elif r_wcomattrval.match(atrv):
|
||||
name, type = tokenize_single_wcomma(atrv)
|
||||
next_item = next(iterable)
|
||||
else:
|
||||
# Not sure we should support this, as it does not seem supported by
|
||||
# weka.
|
||||
raise ValueError("multi line not supported yet")
|
||||
else:
|
||||
raise ValueError("First line unparsable: %s" % sattr)
|
||||
|
||||
attribute = to_attribute(name, type)
|
||||
|
||||
if type.lower() == 'relational':
|
||||
next_item = read_relational_attribute(iterable, attribute, next_item)
|
||||
# raise ValueError("relational attributes not supported yet")
|
||||
|
||||
return attribute, next_item
|
||||
|
||||
|
||||
def tokenize_single_comma(val):
|
||||
# XXX we match twice the same string (here and at the caller level). It is
|
||||
# stupid, but it is easier for now...
|
||||
m = r_comattrval.match(val)
|
||||
if m:
|
||||
try:
|
||||
name = m.group(1).strip()
|
||||
type = m.group(2).strip()
|
||||
except IndexError:
|
||||
raise ValueError("Error while tokenizing attribute")
|
||||
else:
|
||||
raise ValueError("Error while tokenizing single %s" % val)
|
||||
return name, type
|
||||
|
||||
|
||||
def tokenize_single_wcomma(val):
|
||||
# XXX we match twice the same string (here and at the caller level). It is
|
||||
# stupid, but it is easier for now...
|
||||
m = r_wcomattrval.match(val)
|
||||
if m:
|
||||
try:
|
||||
name = m.group(1).strip()
|
||||
type = m.group(2).strip()
|
||||
except IndexError:
|
||||
raise ValueError("Error while tokenizing attribute")
|
||||
else:
|
||||
raise ValueError("Error while tokenizing single %s" % val)
|
||||
return name, type
|
||||
|
||||
|
||||
def read_relational_attribute(ofile, relational_attribute, i):
|
||||
"""Read the nested attributes of a relational attribute"""
|
||||
|
||||
r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
|
||||
relational_attribute.name + r'\s*$')
|
||||
|
||||
while not r_end_relational.match(i):
|
||||
m = r_headerline.match(i)
|
||||
if m:
|
||||
isattr = r_attribute.match(i)
|
||||
if isattr:
|
||||
attr, i = tokenize_attribute(ofile, i)
|
||||
relational_attribute.attributes.append(attr)
|
||||
else:
|
||||
raise ValueError("Error parsing line %s" % i)
|
||||
else:
|
||||
i = next(ofile)
|
||||
|
||||
i = next(ofile)
|
||||
return i
|
||||
|
||||
|
||||
def read_header(ofile):
|
||||
"""Read the header of the iterable ofile."""
|
||||
i = next(ofile)
|
||||
|
||||
# Pass first comments
|
||||
while r_comment.match(i):
|
||||
i = next(ofile)
|
||||
|
||||
# Header is everything up to DATA attribute ?
|
||||
relation = None
|
||||
attributes = []
|
||||
while not r_datameta.match(i):
|
||||
m = r_headerline.match(i)
|
||||
if m:
|
||||
isattr = r_attribute.match(i)
|
||||
if isattr:
|
||||
attr, i = tokenize_attribute(ofile, i)
|
||||
attributes.append(attr)
|
||||
else:
|
||||
isrel = r_relation.match(i)
|
||||
if isrel:
|
||||
relation = isrel.group(1)
|
||||
else:
|
||||
raise ValueError("Error parsing line %s" % i)
|
||||
i = next(ofile)
|
||||
else:
|
||||
i = next(ofile)
|
||||
|
||||
return relation, attributes
|
||||
|
||||
|
||||
class MetaData(object):
|
||||
"""Small container to keep useful information on a ARFF dataset.
|
||||
|
||||
Knows about attributes names and types.
|
||||
|
||||
Examples
|
||||
--------
|
||||
::
|
||||
|
||||
data, meta = loadarff('iris.arff')
|
||||
# This will print the attributes names of the iris.arff dataset
|
||||
for i in meta:
|
||||
print(i)
|
||||
# This works too
|
||||
meta.names()
|
||||
# Getting attribute type
|
||||
types = meta.types()
|
||||
|
||||
Methods
|
||||
-------
|
||||
names
|
||||
types
|
||||
|
||||
Notes
|
||||
-----
|
||||
Also maintains the list of attributes in order, i.e., doing for i in
|
||||
meta, where meta is an instance of MetaData, will return the
|
||||
different attribute names in the order they were defined.
|
||||
"""
|
||||
def __init__(self, rel, attr):
|
||||
self.name = rel
|
||||
|
||||
# We need the dictionary to be ordered
|
||||
self._attributes = OrderedDict((a.name, a) for a in attr)
|
||||
|
||||
def __repr__(self):
|
||||
msg = ""
|
||||
msg += "Dataset: %s\n" % self.name
|
||||
for i in self._attributes:
|
||||
msg += "\t%s's type is %s" % (i, self._attributes[i].type_name)
|
||||
if self._attributes[i].range:
|
||||
msg += ", range is %s" % str(self._attributes[i].range)
|
||||
msg += '\n'
|
||||
return msg
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self._attributes)
|
||||
|
||||
def __getitem__(self, key):
|
||||
attr = self._attributes[key]
|
||||
|
||||
return (attr.type_name, attr.range)
|
||||
|
||||
def names(self):
|
||||
"""Return the list of attribute names.
|
||||
|
||||
Returns
|
||||
-------
|
||||
attrnames : list of str
|
||||
The attribute names.
|
||||
"""
|
||||
return list(self._attributes)
|
||||
|
||||
def types(self):
|
||||
"""Return the list of attribute types.
|
||||
|
||||
Returns
|
||||
-------
|
||||
attr_types : list of str
|
||||
The attribute types.
|
||||
"""
|
||||
attr_types = [self._attributes[name].type_name
|
||||
for name in self._attributes]
|
||||
return attr_types
|
||||
|
||||
|
||||
def loadarff(f):
|
||||
"""
|
||||
Read an arff file.
|
||||
|
||||
The data is returned as a record array, which can be accessed much like
|
||||
a dictionary of NumPy arrays. For example, if one of the attributes is
|
||||
called 'pressure', then its first 10 data points can be accessed from the
|
||||
``data`` record array like so: ``data['pressure'][0:10]``
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
f : file-like or str
|
||||
File-like object to read from, or filename to open.
|
||||
|
||||
Returns
|
||||
-------
|
||||
data : record array
|
||||
The data of the arff file, accessible by attribute names.
|
||||
meta : `MetaData`
|
||||
Contains information about the arff file such as name and
|
||||
type of attributes, the relation (name of the dataset), etc.
|
||||
|
||||
Raises
|
||||
------
|
||||
ParseArffError
|
||||
This is raised if the given file is not ARFF-formatted.
|
||||
NotImplementedError
|
||||
The ARFF file has an attribute which is not supported yet.
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
This function should be able to read most arff files. Not
|
||||
implemented functionality include:
|
||||
|
||||
* date type attributes
|
||||
* string type attributes
|
||||
|
||||
It can read files with numeric and nominal attributes. It cannot read
|
||||
files with sparse data ({} in the file). However, this function can
|
||||
read files with missing data (? in the file), representing the data
|
||||
points as NaNs.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.io import arff
|
||||
>>> from io import StringIO
|
||||
>>> content = \"\"\"
|
||||
... @relation foo
|
||||
... @attribute width numeric
|
||||
... @attribute height numeric
|
||||
... @attribute color {red,green,blue,yellow,black}
|
||||
... @data
|
||||
... 5.0,3.25,blue
|
||||
... 4.5,3.75,green
|
||||
... 3.0,4.00,red
|
||||
... \"\"\"
|
||||
>>> f = StringIO(content)
|
||||
>>> data, meta = arff.loadarff(f)
|
||||
>>> data
|
||||
array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
|
||||
dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
|
||||
>>> meta
|
||||
Dataset: foo
|
||||
\twidth's type is numeric
|
||||
\theight's type is numeric
|
||||
\tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')
|
||||
|
||||
"""
|
||||
if hasattr(f, 'read'):
|
||||
ofile = f
|
||||
else:
|
||||
ofile = open(f, 'rt')
|
||||
try:
|
||||
return _loadarff(ofile)
|
||||
finally:
|
||||
if ofile is not f: # only close what we opened
|
||||
ofile.close()
|
||||
|
||||
|
||||
def _loadarff(ofile):
|
||||
# Parse the header file
|
||||
try:
|
||||
rel, attr = read_header(ofile)
|
||||
except ValueError as e:
|
||||
msg = "Error while parsing header, error was: " + str(e)
|
||||
raise ParseArffError(msg)
|
||||
|
||||
# Check whether we have a string attribute (not supported yet)
|
||||
hasstr = False
|
||||
for a in attr:
|
||||
if isinstance(a, StringAttribute):
|
||||
hasstr = True
|
||||
|
||||
meta = MetaData(rel, attr)
|
||||
|
||||
# XXX The following code is not great
|
||||
# Build the type descriptor descr and the list of convertors to convert
|
||||
# each attribute to the suitable type (which should match the one in
|
||||
# descr).
|
||||
|
||||
# This can be used once we want to support integer as integer values and
|
||||
# not as numeric anymore (using masked arrays ?).
|
||||
|
||||
if hasstr:
|
||||
# How to support string efficiently ? Ideally, we should know the max
|
||||
# size of the string before allocating the numpy array.
|
||||
raise NotImplementedError("String attributes not supported yet, sorry")
|
||||
|
||||
ni = len(attr)
|
||||
|
||||
def generator(row_iter, delim=','):
|
||||
# TODO: this is where we are spending time (~80%). I think things
|
||||
# could be made more efficiently:
|
||||
# - We could for example "compile" the function, because some values
|
||||
# do not change here.
|
||||
# - The function to convert a line to dtyped values could also be
|
||||
# generated on the fly from a string and be executed instead of
|
||||
# looping.
|
||||
# - The regex are overkill: for comments, checking that a line starts
|
||||
# by % should be enough and faster, and for empty lines, same thing
|
||||
# --> this does not seem to change anything.
|
||||
|
||||
# 'compiling' the range since it does not change
|
||||
# Note, I have already tried zipping the converters and
|
||||
# row elements and got slightly worse performance.
|
||||
elems = list(range(ni))
|
||||
|
||||
dialect = None
|
||||
for raw in row_iter:
|
||||
# We do not abstract skipping comments and empty lines for
|
||||
# performance reasons.
|
||||
if r_comment.match(raw) or r_empty.match(raw):
|
||||
continue
|
||||
|
||||
row, dialect = split_data_line(raw, dialect)
|
||||
|
||||
yield tuple([attr[i].parse_data(row[i]) for i in elems])
|
||||
|
||||
a = list(generator(ofile))
|
||||
# No error should happen here: it is a bug otherwise
|
||||
data = np.array(a, [(a.name, a.dtype) for a in attr])
|
||||
return data, meta
|
||||
|
||||
|
||||
# ----
|
||||
# Misc
|
||||
# ----
|
||||
def basic_stats(data):
|
||||
nbfac = data.size * 1. / (data.size - 1)
|
||||
return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac
|
||||
|
||||
|
||||
def print_attribute(name, tp, data):
|
||||
type = tp.type_name
|
||||
if type == 'numeric' or type == 'real' or type == 'integer':
|
||||
min, max, mean, std = basic_stats(data)
|
||||
print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std))
|
||||
else:
|
||||
print(str(tp))
|
||||
|
||||
|
||||
def test_weka(filename):
|
||||
data, meta = loadarff(filename)
|
||||
print(len(data.dtype))
|
||||
print(data.size)
|
||||
for i in meta:
|
||||
print_attribute(i, meta[i], data[i])
|
||||
|
||||
|
||||
# make sure nose does not find this as a test
|
||||
test_weka.__test__ = False
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
filename = sys.argv[1]
|
||||
test_weka(filename)
|
Loading…
Add table
Add a link
Reference in a new issue