Always output array data as a pandas object
Committed by eyurtsev on Feb 19, 2018 (commit ee6ec07, parent 366bdc7)
Showing 2 changed files with 29 additions and 45 deletions.
fcsparser/api.py (44 changes: 18 additions & 26 deletions)
@@ -19,15 +19,9 @@
 import warnings
 
 import numpy
+import pandas as pd
 import six
 
-try:
-    import pandas as pd
-except ImportError:
-    pd = None
-    warnings.warn(u'pandas is not installed, so the parse_fcs function can only be used together '
-                  u'with numpy.')
-
 logger = logging.getLogger(__name__)


@@ -484,8 +478,8 @@ def dataframe(self):
         return pd.DataFrame(data, columns=channel_names)
 
 
-def parse(path, meta_data_only=False, output_format='DataFrame', compensate=False,
-          channel_naming='$PnS', reformat_meta=False, data_set=0):
+def parse(path, meta_data_only=False, compensate=False, channel_naming='$PnS',
+          reformat_meta=False, data_set=0, dtype='float32'):
     """Parse an fcs file at the location specified by the path.
 
     Parameters
@@ -514,7 +508,13 @@ def parse(path, meta_data_only=False, output_format='DataFrame', compensate=False,
         into a DataFrame and moved into the '_channels_' key
     data_set: int
         Index of retrieved data set in the fcs file.
-        This value specifies the data set being retrieved from an fcs file with multple data sets.
+        This value specifies the data set being retrieved from an fcs file with multiple data sets.
+    dtype: str | None
+        If provided, will force convert all data into this dtype.
+        This is set by default to auto-convert to float32 to deal with cases in which the original
+        data has been stored using a smaller data type (e.g., uint8). This modifies the original
+        data, but should make follow-up analysis safer in basically all cases.
 
     Returns
     -------
@@ -530,32 +530,24 @@ def parse(path, meta_data_only=False, output_format='DataFrame', compensate=False,
     --------
     fname = '../tests/data/EY_2013-05-03_EID_214_PID_1120_Piperacillin_Well_B7.001.fcs'
     meta = parse_fcs(fname, meta_data_only=True)
-    meta, data_pandas = parse_fcs(fname, meta_data_only=False, output_format='DataFrame')
-    meta, data_numpy = parse_fcs(fname, meta_data_only=False, output_format='ndarray')
+    meta, data_pandas = parse_fcs(fname, meta_data_only=False)
     """
     if compensate:
         raise ParserFeatureNotImplementedError(u'Compensation has not been implemented yet.')
 
-    if reformat_meta or (output_format == 'DataFrame'):
-        if pd is None:
-            raise ImportError(u'You do not have pandas installed.')
-
     read_data = not meta_data_only
 
-    parsed_fcs = FCSParser(path, read_data=read_data, channel_naming=channel_naming,
+    fcs_parser = FCSParser(path, read_data=read_data, channel_naming=channel_naming,
                            data_set=data_set)
 
     if reformat_meta:
-        parsed_fcs.reformat_meta()
+        fcs_parser.reformat_meta()
 
-    meta = parsed_fcs.annotation
+    meta = fcs_parser.annotation
 
     if meta_data_only:
         return meta
-    elif output_format == 'DataFrame':
-        return meta, parsed_fcs.dataframe
-    elif output_format == 'ndarray':
-        # Constructs numpy matrix
-        return meta, parsed_fcs.data
-    else:
-        raise ValueError(u'The output_format must be either "ndarray" or "DataFrame".')
+    else:  # Then include both meta and dataframe.
+        df = fcs_parser.dataframe
+        df = df.astype(dtype) if dtype else df
+        return meta, df
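
For callers, the practical effect of this commit is that parse() always returns a pandas DataFrame and no longer accepts output_format. Below is a minimal usage sketch of the post-commit behavior; the file path is hypothetical, and it assumes parse is importable from fcsparser.api as defined in the diff above.

# Sketch of the post-commit API; 'example.fcs' is a hypothetical path.
from fcsparser.api import parse

path = 'tests/data/example.fcs'

# Metadata only: still returns a single dict.
meta = parse(path, meta_data_only=True)

# Metadata plus events: now always a pandas DataFrame, cast to float32 by default.
meta, df = parse(path, reformat_meta=True)

# Code that previously passed output_format='ndarray' can recover a numpy array:
matrix = df.values

# Passing dtype=None skips the cast and keeps whatever dtype the FCS file stored.
meta, raw_df = parse(path, dtype=None)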
fcsparser/tests/test_fcs_reader.py (30 changes: 11 additions & 19 deletions)
@@ -43,24 +43,24 @@
 def check_data_segment(file_name, expected_array_values):
     """Check that the data segmented extracted from the file corresponds to the expected values."""
     file_path = FILE_IDENTIFIER_TO_PATH[file_name]
-    meta, matrix = parse_fcs(file_path, output_format='ndarray')
-    diff = numpy.abs(expected_array_values - matrix[0:4, :])
+    meta, df = parse_fcs(file_path)
+    diff = numpy.abs(expected_array_values - df.values[0:4, :])
     return numpy.all(diff < 10 ** -8)  # Is this the proper way to do the test?
 
 
 class TestFCSReader(unittest.TestCase):
     def test_mq_FCS_2_0_text_segment(self):
         """Test TEXT segment parsed from FCS (2.0 format) file from a MACSQuant flow cytometer."""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 2.0']
-        meta = parse_fcs(fname, meta_data_only=True, output_format='ndarray')
+        meta = parse_fcs(fname, meta_data_only=True)
         self.assertEqual('EY_2013-07-19_PBS_FCS_2.0_Custom_Without_Add_Well_A1.001.fcs',
                          meta['$FIL'])
         self.assertEqual('MACSQuant', meta['$CYT'])
 
     def test_mq_FCS_3_0_text_segment(self):
         """Test TEXT segment parsed from FCS (3.0 format) file from a MACSQuant flow cytometer."""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.0']
-        meta = parse_fcs(fname, meta_data_only=True, output_format='ndarray')
+        meta = parse_fcs(fname, meta_data_only=True)
 
         expected_fname = 'EY_2013-07-19_PID_101_MG1655_Transformants_D01_Well_A4.001.fcs'
         self.assertEqual(expected_fname, meta['$FIL'])
@@ -69,7 +69,7 @@ def test_mq_FCS_3_0_text_segment(self):
     def test_mq_FCS_3_1_text_segment(self):
         """Test TEXT segment parsed from FCS (3.1 format) file from a MACSQuant flow cytometer."""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.1']
-        meta = parse_fcs(fname, meta_data_only=True, output_format='ndarray')
+        meta = parse_fcs(fname, meta_data_only=True)
         self.assertEqual('MACSQuant', meta['$CYT'])
 
     def test_mq_FCS_2_0_data_segment(self):
@@ -205,7 +205,7 @@ def test_Fortessa_data_segment(self):
     def test_mq_FCS_3_1_data_segment(self):
         """Test DATA segment parsed from FCS (3.1 format) file from a MACSQuant flow cytometer"""
         fname = FILE_IDENTIFIER_TO_PATH['mq fcs 3.1']
-        meta, matrix = parse_fcs(fname, output_format='ndarray')
+        meta, df = parse_fcs(fname)
 
     def test_fcs_reader_API(self):
         """Make sure that the API remains consistent."""
@@ -214,13 +214,8 @@ def test_fcs_reader_API(self):
         # Invoke the parser in multiple ways to make sure that all invocations run successfully.
         # This is a shallow test that only verifies consistency.
         meta = parse_fcs(fname, meta_data_only=True)
-        meta, data_pandas = parse_fcs(fname, meta_data_only=False, output_format='DataFrame')
-        meta, data_pandas = parse_fcs(fname, meta_data_only=False, output_format='DataFrame',
-                                      reformat_meta=True)
-        meta, data_numpy = parse_fcs(fname, meta_data_only=False, output_format='ndarray',
-                                     reformat_meta=False)
-        meta, data_numpy = parse_fcs(fname, meta_data_only=False, output_format='ndarray',
-                                     reformat_meta=True)
+        meta, data_pandas = parse_fcs(fname, meta_data_only=False)
+        meta, data_pandas = parse_fcs(fname, meta_data_only=False, reformat_meta=True)
         self.assertIsInstance(meta['_channel_names_'], tuple)
         self.assertGreater(len(meta['_channel_names_']), 0)
@@ -278,21 +273,18 @@ def test_speed_of_reading_fcs_files(self):
         number = 1000
 
         time = timeit.timeit(
-            lambda: parse_fcs(file_path, meta_data_only=True, output_format='DataFrame',
-                              reformat_meta=False), number=number)
+            lambda: parse_fcs(file_path, meta_data_only=True, reformat_meta=False), number=number)
 
         print('Loading fcs file {0} times with meta_data only without reformatting of '
               'meta takes {1} per loop'.format(time / number, number))
 
         time = timeit.timeit(
-            lambda: parse_fcs(file_path, meta_data_only=True, output_format='DataFrame',
-                              reformat_meta=True), number=number)
+            lambda: parse_fcs(file_path, meta_data_only=True, reformat_meta=True), number=number)
         print('Loading fcs file {0} times with meta_data only with reformatting of '
               'meta takes {1} per loop'.format(time / number, number))
 
         time = timeit.timeit(
-            lambda: parse_fcs(file_path, meta_data_only=False, output_format='DataFrame',
-                              reformat_meta=False), number=number)
+            lambda: parse_fcs(file_path, meta_data_only=False, reformat_meta=False), number=number)
 
         print('Loading fcs file {0} times both meta and data but without reformatting of '
               'meta takes {1} per loop'.format(time / number, number))
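
A hedged sketch of an additional check one could add alongside these tests (a hypothetical helper, not part of this commit), exercising the default float32 cast and the dtype=None escape hatch introduced above:

import numpy
from fcsparser.api import parse

def check_dtype_behaviour(file_path):
    """Hypothetical check: default parse() output is float32; dtype=None keeps the stored dtype."""
    meta, df = parse(file_path)                # default: values cast to float32
    assert all(dt == numpy.float32 for dt in df.dtypes)

    meta, raw = parse(file_path, dtype=None)   # no cast: original storage dtype kept
    numpy.testing.assert_allclose(raw.values.astype('float32'), df.values)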