% Source: www.gusucode.com > datastoreio toolbox MATLAB source code > datastoreio/+matlab/+io/+datastore/+splitter/TextFileSplitter.m
classdef TextFileSplitter < matlab.io.datastore.splitter.FileSizeBasedSplitter
%TEXTFILESPLITTER Class for creating splits from text files.
%   A TextFileSplitter carries, in addition to the splits and split size
%   managed by its FileSizeBasedSplitter superclass, the file encoding and
%   the end-of-record (EOR) delimiter that are handed to every reader
%   created through createReader.

%   Copyright 2015 The MathWorks, Inc.

    properties (Dependent)
        % FileEncoding to use for the stream. Backed by PrivateFileEncoding;
        % assigning it re-validates the encoding and may change the split
        % size (see set.FileEncoding).
        FileEncoding;
    end

    properties
        % End of record (line) to use for reading correctly. Stored in its
        % escaped form ('\r\n', '\n', '\r') when it is a standard
        % delimiter, otherwise as a single character (see validateEOR).
        EOR = [];
    end

    properties (Access = 'private')
        % Storage for the dependent FileEncoding property.
        PrivateFileEncoding = 'UTF-8';
    end

    properties (Constant, Access = 'private')
        DEFAULT_FILE_ENCODING = 'UTF-8';
        DEFAULT_EOR = '\r\n';
    end

    methods (Static)
        function this = create(files, splitSize, fileEncoding, eor, fileSizes)
            %CREATE Create a TextFileSplitter.
            %   THIS = CREATE(FILES) uses the default split size, file
            %   encoding ('UTF-8') and end of record ('\r\n').
            %
            %   THIS = CREATE(FILES, SPLITSIZE, FILEENCODING, EOR, FILESIZES)
            %   overrides the defaults. Trailing arguments may be omitted;
            %   FILESIZES defaults to []. FILES, SPLITSIZE and FILESIZES
            %   are forwarded to FileSizeBasedSplitter.createArgs.
            import matlab.io.datastore.splitter.FileSizeBasedSplitter;
            import matlab.io.datastore.splitter.TextFileSplitter;

            narginchk(1,5);

            % Fill in one default per missing trailing argument.
            if nargin < 2
                splitSize = FileSizeBasedSplitter.DEFAULT_SPLIT_SIZE;
            end
            if nargin < 3
                fileEncoding = TextFileSplitter.DEFAULT_FILE_ENCODING;
            end
            if nargin < 4
                eor = TextFileSplitter.DEFAULT_EOR;
            end
            if nargin < 5
                fileSizes = [];
            end

            % Make fileEncoding and splitSize compatible. Passing true
            % keeps an Inf split size as given for seekable encodings.
            [fileEncoding, newSplitSize] = ...
                TextFileSplitter.validateProps(fileEncoding, ...
                                               splitSize, true);

            % Get the FileSizeBasedSplitter constructor arguments.
            [splits, newSplitSize] = ...
                FileSizeBasedSplitter.createArgs(files, newSplitSize, fileSizes);

            % Construct the TextFileSplitter.
            this = TextFileSplitter(splits, newSplitSize, fileEncoding, eor);
        end

        function this = createFromSplits(splits)
            %CREATEFROMSPLITS Create a TextFileSplitter given existing splits.
            %   This method is usually called from loadobj. The resulting
            %   splitter uses the default file encoding and end of record;
            %   callers that need other values must set them afterwards.
            import matlab.io.datastore.splitter.FileSizeBasedSplitter;
            import matlab.io.datastore.splitter.TextFileSplitter;

            % The signature accepts exactly one input.
            narginchk(1,1);

            fileEncoding = TextFileSplitter.DEFAULT_FILE_ENCODING;
            eor = TextFileSplitter.DEFAULT_EOR;
            [splits, splitSize] = FileSizeBasedSplitter.createFromSplitsArgs(splits);
            this = TextFileSplitter(splits, splitSize, fileEncoding, eor);
        end
    end

    methods (Access = 'protected')
        function this = TextFileSplitter(splits, splitSize, fileEncoding, eor)
            %TEXTFILESPLITTER Construct from already validated arguments.
            %   FILEENCODING is stored directly in PrivateFileEncoding
            %   (bypassing set.FileEncoding); EOR goes through set.EOR and
            %   is therefore validated.
            this@matlab.io.datastore.splitter.FileSizeBasedSplitter(splits, splitSize);
            this.PrivateFileEncoding = fileEncoding;
            this.EOR = eor;
        end
    end

    methods
        % FileEncoding setter
        function set.FileEncoding(this, fileEncoding)
            %SET.FILEENCODING Validate the encoding and adjust the split size.
            %   The setter returns no object, so it relies on handle
            %   semantics provided by the superclass.
            import matlab.io.datastore.splitter.TextFileSplitter;

            % NOTE(review): when there are no splits the assignment is
            % silently ignored and FileEncoding keeps its previous value.
            % Preserved as-is; confirm with callers that this is intended.
            if ~isempty(this.Splits)
                [fileEncoding, splitSize] = ...
                    TextFileSplitter.validateProps(fileEncoding, this.SplitSize);
                % Does nothing if the split size did not change.
                changeSplitSize(this, splitSize);
                this.PrivateFileEncoding = fileEncoding;
            end
        end

        % EOR setter
        function set.EOR(this, eor)
            %SET.EOR Validate and normalize the end of record before storing.
            this.EOR = validateEOR(eor);
        end

        % FileEncoding getter
        function fileEncoding = get.FileEncoding(this)
            %GET.FILEENCODING Return the stored file encoding.
            fileEncoding = this.PrivateFileEncoding;
        end
    end

    methods
        function cp = createCopyWithSplits(this, splits)
            %CREATECOPYWITHSPLITS Create Splitter from existing Splits.
            %   Creates a splitter that is identical except for the splits
            %   it contains. Splits passed as input must be identical in
            %   structure to the splits used by this Splitter class.
            cp = copy(this);
            cp.Splits = splits;
        end

        function rdr = createReader(this, ii)
            %CREATEREADER Create a reader for the ii-th split.
            %   The reader receives the ii-th split together with this
            %   splitter's FileEncoding and EOR.
            rdr = matlab.io.datastore.splitreader.TextFileSplitReader;
            rdr.Split = this.Splits(ii);
            rdr.FileEncoding = this.FileEncoding;
            rdr.EOR = this.EOR;
        end

        function useFullFile(this, isFullFile)
            %USEFULLFILE Switch between full-file splits and default-size splits.
            %   When ISFULLFILE is true the split size becomes Inf;
            %   otherwise the default split size is validated against the
            %   current FileEncoding and applied. The change is not forced.
            import matlab.io.datastore.splitter.FileSizeBasedSplitter;
            resizeSplits(this, isFullFile, FileSizeBasedSplitter.DEFAULT_SPLIT_SIZE, false);
        end

        function resizeHadoopSplits(this, splitSize)
            %RESIZEHADOOPSPLITS Resize Hadoop splits to the provided splitSize.
            %   We want to force the resizing of splits for Hadoop splits,
            %   so the change is requested with forceChange set to true.
            isFullFile = isFullFileSplitter(this);
            resizeSplits(this, isFullFile, splitSize, true);
        end
    end

    methods (Access = private)
        function resizeSplits(this, isFullFile, splitSize, forceChange)
            %RESIZESPLITS Apply a new split size to this splitter.
            %   ISFULLFILE true ignores SPLITSIZE and uses Inf. Otherwise
            %   SPLITSIZE is first validated against the FileEncoding.
            %   FORCECHANGE is forwarded to changeSplitSize.

            % import is processed before any conditional is evaluated, so
            % it belongs at function scope rather than inside a branch.
            import matlab.io.datastore.splitter.TextFileSplitter;

            if isFullFile
                % Full file assumes reading all of the data from the file.
                % Resize to the maximum available from FileSize.
                this.changeSplitSize(Inf, forceChange);
            else
                % Validate based on FileEncoding, etc.
                [~, splitSize] = ...
                    TextFileSplitter.validateProps(this.FileEncoding, splitSize);
                this.changeSplitSize(splitSize, forceChange);
            end
        end
    end

    methods (Static, Access = 'private')
        function [fileEncoding, splitSize] = validateProps(fileEncoding, splitSize, forSplitSizeForSeekableEnc)
            %VALIDATEPROPS Updates file encoding and split size.
            %   Returns the canonical name of FILEENCODING together with a
            %   split size compatible with it:
            %     * non-seekable encoding -> Inf (whole-file splits);
            %     * seekable encoding with an Inf SPLITSIZE -> the default
            %       split size, unless FORSPLITSIZEFORSEEKABLEENC is true;
            %     * otherwise SPLITSIZE is returned unchanged.
            %   FORSPLITSIZEFORSEEKABLEENC defaults to false.
            import matlab.io.datastore.internal.encodingStats;
            import matlab.io.datastore.splitter.FileSizeBasedSplitter;

            % Do not enforce using the given split size by default.
            if nargin < 3
                forSplitSizeForSeekableEnc = false;
            end

            % Get the canonical name.
            encStats = encodingStats(fileEncoding);
            fileEncoding = encStats.CanonicalName;

            % Non-seekable -> whole file splits.
            % Seekable -> default-size splits only when the provided
            % splitSize is Inf and it is allowed to be modified.
            if ~encStats.IsSeekable
                splitSize = Inf;
            elseif isinf(splitSize) && ~forSplitSizeForSeekableEnc
                splitSize = FileSizeBasedSplitter.DEFAULT_SPLIT_SIZE;
            end
        end
    end
end

function eor = validateEOR(eor)
%VALIDATEEOR Validates a given end of record
%   This function is only responsible for validating a given end of record
%   and returning the escaped (un-sprintf'ed) version of it, if it is one
%   of the standard delimiters. Errors with
%   MATLAB:datastoreio:tabulartextdatastore:invalidRowDelimiter when EOR is
%   empty, is not a char array, or is a non-standard delimiter that is not
%   exactly one character long.

% Empty or non-char eor is not allowed.
if isempty(eor) || ~ischar(eor)
    error(message('MATLAB:datastoreio:tabulartextdatastore:invalidRowDelimiter'));
end

% Check if it is a standard eor; get back the un-sprintf'ed version.
[stdFlag, eor] = isStandardEOR(eor);

% If non-standard, it must be a single character.
if ~stdFlag && 1 ~= numel(eor)
    error(message('MATLAB:datastoreio:tabulartextdatastore:invalidRowDelimiter'));
end
end

function [ tf, eor ] = isStandardEOR(eor)
%ISSTANDARDEOR checks if the given eor is standard.
%   This function compares the specified eor with the supported standard
%   eor's, each accepted either as the escaped text (e.g. '\n') or as the
%   actual control character(s) (e.g. sprintf('\n')). TF indicates whether
%   the given eor is one of them. When it is, EOR is returned in its
%   escaped (un-sprintf'ed) form for easy display in the object; otherwise
%   EOR is returned unchanged.

tf = true;
if any(strcmp(eor, {'\r\n', sprintf('\r\n')}))
    eor = '\r\n';
elseif any(strcmp(eor, {'\n', sprintf('\n')}))
    eor = '\n';
elseif any(strcmp(eor, {'\r', sprintf('\r')}))
    eor = '\r';
else
    tf = false;
end
end