% Source: www.gusucode.com > datastoreio工具箱 matlab源码程序 > datastoreio/+matlab/+io/+datastore/@TabularTextDatastore/introspectFile.m

    function introspectFile(ds)
%INTROSPECTFILE sets variable names and formats detected from the text file.
%   INTROSPECTFILE(DS) calls readVarFormat to obtain the variable names,
%   the textscan formats and the skipped-format flags, and uses them to set
%   VariableNames, TextscanFormats, SelectedVariableNames and
%   SelectedFormats on DS. It returns without touching DS when there are no
%   splits or when every split has a Size of zero.
%
%   Errors are thrown when the number of user supplied VariableNames does
%   not match the number of formats, when SelectedVariableNames or
%   SelectedFormats are supplied together with skipped formats, and when
%   SelectedFormats are supplied without SelectedVariableNames.

%   Copyright 2014-2016 The MathWorks, Inc.

    import matlab.io.datastore.internal.makeValidVars;
    import matlab.internal.tableUtils.warningWithoutTrace;

    % nothing to introspect unless at least one split has a nonzero Size
    splits = ds.Splitter.Splits;
    if isempty(splits) || ~any([splits.Size] ~= 0)
        return;
    end

    % what the user supplied at construction time (always cellstrs)
    userSpec = ds.PrivateVarFormatStruct;

    % names and formats come back as cellstrs; isSkipped flags the formats
    % that are skipped
    [names, formats, isSkipped] = readVarFormat(ds, userSpec);

    userNames      = userSpec.VariableNames;
    userSelNames   = userSpec.SelectedVariableNames;
    userSelFormats = userSpec.SelectedFormats;

    % user supplied VariableNames take precedence over the detected ones
    if ~isempty(userNames)

        % warn that the names read from the file are being replaced
        % (ReadVariableNames is true by default)
        if ds.ReadVariableNames
            warningWithoutTrace(message('MATLAB:datastoreio:tabulartextdatastore:replaceReadVariableNames'));
        end

        % one format is required per variable name
        if numel(userNames) ~= numel(formats)
            error(message('MATLAB:datastoreio:tabulartextdatastore:varFormatMismatch', ...
                          'VariableNames', 'TextscanFormats'));
        end

        names = userNames;
    end

    % sanitize the names, warning when a name has to be modified
    cleanNames = makeValidVars(names, 'warn');

    % seed the private properties first so that the numel checks performed
    % by the public setters below see a consistent state
    ds.PrivateVariableNames = cleanNames;
    ds.PrivateTextscanFormats = formats;

    % now go through the public setters
    ds.VariableNames = cleanNames;
    ds.TextscanFormats = formats;

    % formats with skips: selections cannot be combined with them, the
    % selected variables are simply the ones that are not skipped
    if any(isSkipped)
        if ~isempty(userSelNames)
            error(message('MATLAB:datastoreio:tabulartextdatastore:invalidActiveSkip', ...
                          'SelectedVariableNames'));
        end
        if ~isempty(userSelFormats)
            error(message('MATLAB:datastoreio:tabulartextdatastore:invalidActiveSkip', ...
                          'SelectedFormats'));
        end
        ds.SelectedVariableNames = cleanNames(~isSkipped);
        return;
    end

    % no skips and no SelectedVariableNames: SelectedFormats alone is an
    % error, otherwise there is nothing left to set
    if isempty(userSelNames)
        if ~isempty(userSelFormats)
            error(message('MATLAB:datastoreio:tabulartextdatastore:invalidActiveFormats'));
        end
        return;
    end

    % no skips, SelectedVariableNames supplied: sanitize and apply them,
    % followed by the SelectedFormats when present
    ds.SelectedVariableNames = makeValidVars(userSelNames, 'warn');
    if ~isempty(userSelFormats)
        ds.SelectedFormats = userSelFormats;
    end
end

function [varNames, outFormatCell, skippedVec] = readVarFormat(ds, inStruct)
%READVARFORMAT detects variable names and format information from text file
%   This function is responsible for detecting the variable name and format
%   information from the first file of the datastore, based on the value of
%   ReadVariableNames and on whether TextscanFormats were provided during
%   datastore construction.
%
%   Inputs:
%     ds       - the datastore. Besides being read, some of its private
%                properties are written here: PrivateMultipleDelimitersAsOne,
%                PrivateWhitespace, PrivateDelimiter, PrivateNumHeaderLines
%                and PrivateReadVariableNames (each only when the
%                corresponding value was not supplied by the user).
%     inStruct - user supplied settings; only its TextscanFormats field is
%                read in this function.
%
%   Outputs:
%     varNames      - variable names read from the file, or the defaults
%                     from dfltVarNames when names are not read.
%     outFormatCell - the formats, either detected from the first data
%                     line or derived from the user supplied ones.
%     skippedVec    - all zeros when the formats are detected; otherwise
%                     the second output of concatLiteral.
%
%   Errors:
%     unableToOpenFile          - the first file cannot be opened.
%     varFormatDetectionFailure - no variable name line or data line found.
%     ReadVarNamesFailed        - the number of names read from the file
%                                 differs from the number of formats.

    % imports
    import matlab.internal.table.dfltVarNames;
    import matlab.io.datastore.TabularTextDatastore;
    import matlab.io.datastore.internal.filesys.createStream;
    import matlab.io.internal.text.determineVarNames;
    import matlab.io.internal.text.determineFormatString;    
    import matlab.io.internal.text.detectVariableNames;
    import matlab.io.internal.text.detectParametersFromFileOrStr;

    % currently we use the first file to detect variable names and formats
    file = ds.Files{1};

    % open the first file as a read only stream; any failure is reported as
    % unableToOpenFile (the original exception is not propagated)
    try
        stream = createStream(file, 'rt', ds.FileEncoding);
    catch
        error(message('MATLAB:datastoreio:tabulartextdatastore:unableToOpenFile', file));
    end

    % close the stream on exit, including exits caused by errors.
    cleanup = onCleanup(@() close(stream));

    % Our lower level APIs use up to 4 MB to detect Delimiter,
    % NumHeaderLines and MultipleDelimitersAsOne. Detection runs only when
    % the delimiter or the number of header lines was not supplied.
    delimSupplied = ds.PrivateDelimiterSupplied;
    headerSupplied = ds.PrivateNumHeaderLinesSupplied;
    % empty delimiters are not handled by detectParametersFromFileOrStr
    if ~isempty(ds.Delimiter) && ( ~delimSupplied || ~headerSupplied )
        dataForDetection = readTextBytes(stream, TabularTextDatastore.DEFAULT_DETECTION_SIZE);
        stream.seek(0); % reset the stream for later use.
        args = ds.getTextscanArgs();
        fmt = ds.PrivateVarFormatStruct.TextscanFormats;
        if ~isempty(fmt)
            % when formats were supplied, pass the count of non-literal
            % format specifiers along as 'NumVariables'
            fmt_str = matlab.iofun.internal.formatParser([fmt{:}]);
            n = nnz(~fmt_str.IsLiteral);
            args(end+1:end+2) = {'NumVariables',n};
        end
        [delim,header,multipleDelimsAsOne] = detectParametersFromFileOrStr(dataForDetection,delimSupplied,headerSupplied,ds.Delimiter,ds.NumHeaderLines,false,args);
        % each detected value is stored only if the user did not supply it
        if ~ds.PrivateMultipleDelimitersAsOneSupplied
            ds.PrivateMultipleDelimitersAsOne = multipleDelimsAsOne;
        end
        if ~delimSupplied
            % Detected delimiter might have a whitespace
            % character, if so, remove it from the whitespace
            % parameter to avoid warning

            % unwrap a single delimiter from its cell
            if iscell(delim) && numel(delim) == 1
                delim = delim{1};
            end
            % isWhitespaceUsingDefault argument to handleDelimWhitespaceConflicts
            % method is 'true' here, because we detect the delimiter
            % and we need to handle the conflict without any warnings.
            [delim, ws] = TabularTextDatastore.handleDelimWhitespaceConflicts(...
                delim, ds.PrivateWhitespace, true);
            ds.PrivateWhitespace = ws;
            ds.PrivateDelimiter = delim;
        end
        if ~headerSupplied
            ds.PrivateNumHeaderLines = header;
        end
    end
    
    % escape the delimiter before using: sprintf turns escape sequences
    % such as \t into the actual characters.
    % NOTE(review): the delimiter is used as the sprintf format itself, so
    % a delimiter containing '%' may not survive unchanged - confirm.
    delim = ds.Delimiter;
    if ischar(delim)
        delim = sprintf(delim);
    else
        delim = cellfun(@(x) sprintf(x), delim, 'UniformOutput', false);    
    end
    
    % setting up textscan arguments used for introspection, this does not
    % contain delimiter, whitespace, treatAsMissing and headerlines as they
    % are passed separately. Also MissingValue, ReturnOnError are not
    % passed as they do not affect the VariableNames and Formats. Currently
    % ExponentCharacters cannot be detected as a numeric format.
    cStyle = ds.CommentStyle;
    txtScanArgsforIntroSpection = {'CommentStyle', cStyle, ...
                        'MultipleDelimsAsOne', ds.MultipleDelimitersAsOne};
    
    % index in the data: position reached by textscan after the variable
    % name line, used below to drop that line from the buffer
    strIdx = 0;
    
    % local variables (read after the detection block above, which may
    % have updated the underlying private properties)
    hdrLines = ds.NumHeaderLines;
    readVarNames = ds.ReadVariableNames;
    rowDelim = ds.RowDelimiter;
    whiteSpace = ds.Whitespace;
    varFormatData = [];
    
    % block of code which reads the candidate variable name line from the
    % file. It also runs when ReadVariableNames was not supplied, because
    % the line is needed further down to detect ReadVariableNames.
    if readVarNames || ~ds.PrivateReadVariableNamesSupplied
        % buffer at least 1 row of information (not including the header
        % lines and the comment lines) which includes the variable names
        varFormatData =  ...
            bufferDataFromFile(stream, file, rowDelim, cStyle, hdrLines, ...
                                        whiteSpace, 'VariableNames', true);

        % Read in the first line of var names as a single string, skipping
        % any leading blank lines and header lines. This call handles
        % non-default row delimiters like : for example ignoring delimiter
        % and whitespace. This call also accepts CommentStyle as we want to
        % skip comment lines. Also consume the eor as we do not want to
        % account for it when we reuse varFormatData.
        [raw,strIdx] = textscan(varFormatData, ['%s%[' rowDelim ']'], 1, ...
                                'Delimiter', '', 'Whitespace', whiteSpace, ...
                                'Headerlines', hdrLines, 'EndOfLine', ...
                                rowDelim, txtScanArgsforIntroSpection{:});
        hdrLines = 0; % just skipped them
        if isempty(raw{1}) || isempty(raw{1}{1})
            error(message('MATLAB:datastoreio:tabulartextdatastore:varFormatDetectionFailure', ...
                file, 'VariableNames', 'VariableNames'));
        else
            vnline = raw{1}{1};
        end
    end
    
    % local variables    
    treatAsMissing = ds.TreatAsMissing;
    inFormatCell = inStruct.TextscanFormats;
    
    % block of code that detects formats, used only when no formats were
    % supplied. If no variable name line was read above we buffer data.
    % Otherwise we resize the data (drop the variable name line). We check
    % if it is empty in which case we buffer data from file, otherwise we
    % try to check if the nonempty data ends in a row delimiter. We buffer
    % more if it does not (to check against truncated data) otherwise use
    % the same data to detect formats.
    if isempty(inFormatCell)
        % nothing was buffered for the variable names, buffer data from file.
        if isempty(varFormatData)
            varFormatData =  bufferDataFromFile(stream, file, rowDelim, ...
                                cStyle, hdrLines, whiteSpace, 'TextscanFormats', true);
        else
            % resize the data
            varFormatData = varFormatData(strIdx+1:end);
            
            % if there is no more data, then ask for more
            if isempty(varFormatData)
                varFormatData =  bufferDataFromFile(stream, file, ...
                      rowDelim, cStyle, hdrLines, whiteSpace, 'TextscanFormats', true);
            else
                % ensure data ends at a row delimiter
                delimAtEndOfData = findRowDelim(varFormatData, rowDelim, cStyle, hdrLines, whiteSpace);
                
                % request more data if the data does not end at a row
                % delimiter; running out of data is not an error here
                % because the remainder may be a last line without an eor
                if ~delimAtEndOfData
                    varFormatData = [varFormatData bufferDataFromFile(stream, ...
                                 file,  rowDelim, cStyle, hdrLines, whiteSpace, 'TextscanFormats', false)];
                end
            end
        end
        
        % Guess a format string for the dataline by reading it as a single
        % string, skipping any leading blank lines. This call handles
        % non-default row delimiters like (':') for example, ignoring
        % delimiter and whitespace. This call also accepts CommentStyle as
        % we want to skip comment lines.
        raw = textscan(varFormatData, '%s', 1, 'Delimiter', '', ...
                       'Whitespace', whiteSpace, 'Headerlines', hdrLines, ...
                       'EndOfLine', rowDelim, txtScanArgsforIntroSpection{:});
        if isempty(raw{1}) || isempty(raw{1}{1})
            error(message('MATLAB:datastoreio:tabulartextdatastore:varFormatDetectionFailure', ...
                              file, 'TextscanFormats', 'TextscanFormats'));
        else
            % determine the format string from the first line
            formatStr = determineFormatString(raw{1}{1}, delim, whiteSpace, ...
                              rowDelim, treatAsMissing, txtScanArgsforIntroSpection);
            % convert to a struct
            fStruct = matlab.iofun.internal.formatParser(formatStr);
            outFormatCell = fStruct.Format;
            % detected formats never contain skips
            skippedVec = zeros(1,numel(outFormatCell));
        end
    else
        % user specified formats are always wrapped in a cell
        [outFormatCell, skippedVec] = ...
            TabularTextDatastore.concatLiteral(inFormatCell, ds.TextscanFormatsAsCellStr);
    end
    
    % detect ReadVariableNames: only when the user did not supply it and
    % the formats are not all strings. vnline is always defined here since
    % the variable name block above runs under the same "not supplied"
    % condition.
    % NOTE(review): this call passes ds.Delimiter rather than the escaped
    % local delim that determineVarNames receives below - confirm intended.
    fmtstr = [outFormatCell{:}];
    if ~ds.PrivateReadVariableNamesSupplied && ~matlab.io.internal.text.fomatIsAllString(fmtstr)
        ds.PrivateReadVariableNames = detectVariableNames(fmtstr,vnline,ds.Delimiter,ds.Whitespace,ds.RowDelimiter,txtScanArgsforIntroSpection);
        readVarNames = ds.PrivateReadVariableNames;
    end

    % setup VariableNames
    if readVarNames
        % NOTE(review): the formats are joined with strjoin here (space
        % separated) but concatenated directly for fmtstr above - confirm
        % that determineVarNames expects the space separated form.
        varNames = determineVarNames(vnline, strjoin(outFormatCell), delim, ...
                           whiteSpace, rowDelim, false, txtScanArgsforIntroSpection);
        % one name is required per format
        if numel(varNames) ~= numel(outFormatCell)
            error(message('MATLAB:readtable:ReadVarNamesFailed',file, ...
                                    numel(outFormatCell),numel(varNames)));
        end
    else
        % defaults are {'Var1', 'Var2', ...}
        varNames = dfltVarNames((1:numel(outFormatCell)));
    end
end

function varFormatData =  bufferDataFromFile(stream, file, rowDelim, cStyle, hdrLines, whiteSpace, propName, throwErrorIfNoData)
%BUFFERDATAFROMFILE buffers data from file.
%   This function keeps reading chunks of DEFAULT_PEEK_SIZE from STREAM
%   until findRowDelim reports a row delimiter after the first variable
%   name line or data line (header lines and comment lines are not
%   counted), until the stream has no more data, or until more than
%   BUFFER_UPPERLIMIT bytes have been read.
%
%   An error naming PROPNAME and FILE is thrown when the limit is exceeded
%   without finding a row delimiter. When the very first read returns no
%   data, the same error is thrown if THROWERRORIFNODATA is true; otherwise
%   [] is returned.

    import matlab.io.datastore.TabularTextDatastore;

    chunkSize = TabularTextDatastore.DEFAULT_PEEK_SIZE;
    maxBytes = TabularTextDatastore.BUFFER_UPPERLIMIT;

    % initial peek into the file
    [varFormatData, bytesRead] = readTextBytes(stream, chunkSize);

    % no data at all: either return empty handed or fail right away
    if isempty(varFormatData)
        if ~throwErrorIfNoData
            varFormatData = [];
            return
        end
        error(message('MATLAB:datastoreio:tabulartextdatastore:varFormatDetectionFailure', ...
                      file, propName, propName));
    end

    % is the first variable name or data line already terminated?
    foundEor = findRowDelim(varFormatData, rowDelim, cStyle, hdrLines, whiteSpace);

    % keep appending chunks until the line is terminated or the limit is hit
    while ~foundEor && (bytesRead <= maxBytes)
        [chunk, chunkBytes] = readTextBytes(stream, chunkSize);

        % end of data; the last line may simply have no eor
        if isempty(chunk)
            break;
        end

        % the chunk is non-empty, accumulate it and look again
        varFormatData = [varFormatData chunk];
        bytesRead = bytesRead + chunkBytes;
        foundEor = findRowDelim(varFormatData, rowDelim, cStyle, hdrLines, whiteSpace);
    end

    % gave up because of the size limit
    if ~foundEor && (bytesRead > maxBytes)
        error(message('MATLAB:datastoreio:tabulartextdatastore:varFormatDetectionFailure', ...
                      file, propName, propName));
    end
end

function tf = findRowDelim(varFormatData, rowDelim, cStyle, hdrLines, whiteSpace)
%FINDROWDELIM checks for a row delimiter at the end of a data line.
%   TF is true when textscan can read a row delimiter right after the first
%   line of VARFORMATDATA that is not skipped (HDRLINES header lines and
%   comments in CSTYLE are skipped), and false otherwise. When ROWDELIM is
%   '\r\n' either a \n or a \r is accepted.

    % rowDelim is compared in its textual form, i.e. the characters
    % backslash-r-backslash-n rather than actual control characters
    if ~strcmp(rowDelim, '\r\n')
        tf = hasRowDelimAfterData(rowDelim);
        return;
    end

    % '\r\n': try each of the two characters as the end of row in turn
    tf = hasRowDelimAfterData('\n') || hasRowDelimAfterData('\r');

        % nested function, shares varFormatData, whiteSpace, hdrLines and
        % cStyle with the parent. Reads one line while discarding its text
        % and capturing the run of eor characters that follows it; blank
        % lines, header lines and comment lines are skipped by textscan.
        function found = hasRowDelimAfterData(eor)
            scanFormat = ['%*s%[' eor ']'];
            trailing = textscan(varFormatData, scanFormat, 1, ...
                                'Delimiter', '', 'Whitespace', whiteSpace, ...
                                'Headerlines', hdrLines, 'EndOfLine', eor, ...
                                'CommentStyle', cStyle);
            found = ~isempty(trailing{1}) && ~isempty(trailing{1}{1});
        end
end