www.gusucode.com > datastoreio工具箱 matlab源码程序 > datastoreio/+matlab/+io/+datastore/+splitter/MatKVFileSplitter.m
classdef MatKVFileSplitter < matlab.io.datastore.splitter.FileBasedSplitter
%MATKVFILESPLITTER Splitter for splitting key value mat files.
%   A splitter that creates splits from all the mat files provided that
%   contain key value pairs. All the mat files must have the variables
%   'Key' and 'Value' (optionally with a 'SchemaVersion' variable), or,
%   for values-only files, 'SchemaVersion' and 'Value'. 'Key' can either
%   be a cell array of strings or a numeric vector of length equal to the
%   number of keys. 'Value' is a column of length equal to the number of
%   keys.
%
%   See also - matlab.io.datastore.KeyValueDatastore

%   Copyright 2015-2016 The MathWorks, Inc.

    properties (Constant, Access = private)
        % Default key value split size, one split can have at most.
        DEFAULT_KV_SPLIT_SIZE = 1000;
        % Allowed key and value variable names in the MAT-Files provided.
        MAT_FILE_KEY_VALUE_VARIABLES = {'Key', 'Value'};
        % From 15a we added a SchemaVersion variable to the MAT-Files
        MAT_FILE_THREE_VARIABLES = {'Key', 'SchemaVersion', 'Value'};
        % Allowed value variable names in the MAT-Files provided.
        % This is to support TallDatastore with only Values.
        MAT_FILE_VALUE_VARIABLES = {'SchemaVersion', 'Value'};
        % Filename suffix for TallDatastore MAT-files
        SNAPSHOT_SUFFIX_STR = 'snapshot';
        % Prefix that precedes the hexadecimal value count in the filename
        HEX_PREFIX_STR = '0x';
    end

    methods (Static)
        function splitter = create(fileInfo, kvsplitsize)
            %CREATE Build a splitter from a file information struct.
            %   fileInfo    - struct with at least the fields Files and
            %                 FileSizes; SchemaAvailable and
            %                 ValuesOnlyAvailable are also read when
            %                 Files is non-empty.
            %   kvsplitsize - optional maximum number of key-value pairs
            %                 per split (default DEFAULT_KV_SPLIT_SIZE).
            narginchk(1,2);
            files = fileInfo.Files;
            if ischar(files)
                % NOTE(review): only the local copy is wrapped in a cell;
                % fileInfo.Files itself stays a char when passed on to
                % getSplitsFromInfo - confirm callers always pass a cell.
                files = { files };
            end
            splits = [];
            import matlab.io.datastore.splitter.MatKVFileSplitter;
            if nargin == 1
                kvsplitsize = MatKVFileSplitter.DEFAULT_KV_SPLIT_SIZE;
            end
            splitter = MatKVFileSplitter;
            splitter.SplitSizeLimit = kvsplitsize;
            if ~isempty(files)
                if ~iscellstr(files)
                    error(message('MATLAB:datastoreio:filesplitter:invalidFilesInput'));
                end
                [splits, fileInfo] = getSplitsFromInfo(fileInfo, kvsplitsize);
            end
            splitter.FileSizes = fileInfo.FileSizes;
            splitter.Files = fileInfo.Files;
            splitter.Splits = splits;
        end

        function splitter = createFromSplits(splits)
            %CREATEFROMSPLITS Create a splitter from given splits.
            %   splits must be empty or a struct array that has the
            %   fields File, Offset, Size, FileIndex, SchemaAvailable and
            %   ValuesOnly; otherwise an invalidSplits error is raised.
            import matlab.io.datastore.splitter.MatKVFileSplitter;
            splitter = MatKVFileSplitter;
            splitter.SplitSizeLimit = MatKVFileSplitter.DEFAULT_KV_SPLIT_SIZE;
            splitter.Files = {};
            splitter.FileSizes = [];
            if ~isempty(splits)
                if ~isstruct(splits) || ...
                        ~isempty(setdiff({'File','Offset','Size','FileIndex', 'SchemaAvailable', 'ValuesOnly'},fieldnames(splits)))
                    error(message('MATLAB:datastoreio:filesplitter:invalidSplits'));
                end
                % use unique file indices to set the new FileIndex'es
                [~, idxs, ia] = unique([splits.FileIndex], 'stable');
                splitter.Files = {splits(idxs).File}';
                splitter.FileSizes = [splits(idxs).Size];
                % Renumber so FileIndex refers to positions in the new
                % Files list.
                for ii = 1:numel(splits)
                    splits(ii).FileIndex = ia(ii);
                end
                splitter.Splits = splits;
            end
        end
    end

    methods (Static, Hidden)
        function tf = verifyKeysValues(fileInfo, valuesOnly)
            %VERIFYKEYSVALUES Check the variable layout of a MAT-file.
            %   fileInfo is a struct array with name and size fields, as
            %   returned by whos. Returns true when the file has 2 or 3
            %   variables whose names equal MAT_FILE_KEY_VALUE_VARIABLES
            %   or MAT_FILE_THREE_VARIABLES, and the first and last
            %   variables are column vectors of the same length. When
            %   valuesOnly is true the check is delegated to
            %   verifyValuesOnly.
            import matlab.io.datastore.splitter.MatKVFileSplitter;
            if valuesOnly
                tf = MatKVFileSplitter.verifyValuesOnly(fileInfo);
                return;
            end
            numVars = numel(fileInfo);
            tf = false;
            switch numVars
                % Number of variables must be 2 or 3
                % Variable names must be MAT_FILE_KEY_VALUE_VARIABLES
                % or MAT_FILE_THREE_VARIABLES
                case 2
                    tf = all(strcmp({fileInfo.name}, MatKVFileSplitter.MAT_FILE_KEY_VALUE_VARIABLES));
                case 3
                    tf = all(strcmp({fileInfo.name}, MatKVFileSplitter.MAT_FILE_THREE_VARIABLES));
                otherwise
                    return;
            end
            if ~tf
                return;
            end
            ks = fileInfo(1).size;
            vs = fileInfo(numVars).size;
            % Variables must be column vectors of same length.
            tf = numel(ks) == 2 && numel(vs) == 2 && ...
                ks(2) == 1 && vs(2) == 1 && ks(1) == vs(1);
        end

        function tf = verifyValuesOnly(fileInfo)
            %VERIFYVALUESONLY Verify the Value shape in the MAT-file.
            %   Returns true when the variable names equal
            %   MAT_FILE_VALUE_VARIABLES and the last variable is a
            %   column vector. Any other variable count yields false.
            import matlab.io.datastore.splitter.MatKVFileSplitter;
            numVars = numel(fileInfo);
            tf = false;
            expectedNames = MatKVFileSplitter.MAT_FILE_VALUE_VARIABLES;
            % Compare counts first: strcmp on two cell arrays of
            % different sizes raises an error instead of returning false.
            if numVars ~= numel(expectedNames) || ...
                    ~all(strcmp({fileInfo.name}, expectedNames))
                return;
            end
            vs = fileInfo(numVars).size;
            % Variable Value must be a column vector.
            tf = numel(vs) == 2 && vs(2) == 1;
        end

        function [tf, matFileInfo] = isMatSupported(filename, valuesOnly)
            %ISMATSUPPORTED Check if a MAT-file is supported.
            %   tf          - result of verifyKeysValues on the file, or
            %                 false if the file could not be inspected.
            %   matFileInfo - struct from the whos function; [] when
            %                 whos failed. It is returned even when tf
            %                 is false.
            %
            % Use whos to introspect size and variable names in a mat file.
            % Constructing a matfile object to find the sizes and the variable
            % names would take double the time, compared to whos.
            import matlab.io.datastore.splitter.MatKVFileSplitter;
            tf = false;
            matFileInfo = [];
            warningId = 'MATLAB:whos:UnableToRead';
            % Remember the caller's setting so it is restored as it was,
            % rather than being forced to 'on'.
            previousState = warning('off', warningId);
            c = onCleanup(@() warning(previousState)); %#ok<NASGU>
            try
                matFileInfo = whos('-file', filename);
                if ~isempty(matFileInfo)
                    tf = MatKVFileSplitter.verifyKeysValues(matFileInfo, valuesOnly);
                end
            catch
                % swallow the error and return
            end
        end

        function [fileInfo, tf, idx] = filterMatFiles(files, valuesOnly)
            %FILTERMATFILES Filter MAT-files that are supported.
            %   fileInfo - struct with fields Files, FileSizes,
            %              SchemaAvailable and ValuesOnlyAvailable for
            %              the accepted files; this is the information
            %              needed to create splits.
            %   tf, idx  - when more than one output is requested, the
            %              function stops at the first rejected file and
            %              returns tf = false with idx set to its
            %              position (fileInfo is then left empty).
            %              Otherwise tf = true and idx = -1.
            import matlab.io.datastore.splitter.MatKVFileSplitter;
            fileInfo = [];
            tf = true;
            idx = -1;
            numFiles = numel(files);
            isMat = false(numFiles, 1);
            fileSizes = zeros(numFiles, 1);
            schemaAvailable = false(numFiles, 1);
            valuesOnlyAvailable = false(numFiles, 1);
            for ii = 1:numFiles
                if valuesOnly
                    info = iParseValuesOnlyFilename(files{ii});
                else
                    % NOTE(review): the tf output of isMatSupported is
                    % discarded, so any MAT-file that whos can read and
                    % that has at least one variable is accepted here -
                    % confirm this leniency is intended.
                    [~, info] = MatKVFileSplitter.isMatSupported(files{ii}, valuesOnly);
                end
                if ~isempty(info)
                    isMat(ii) = true;
                    if valuesOnly
                        schemaAvailable(ii) = true;
                        valuesOnlyAvailable(ii) = true;
                        % Get the Value size
                        fileSizes(ii) = info.size;
                    else
                        % Get the Key size
                        fileSizes(ii) = info(1).size(1);
                        if numel(info) == 3
                            % SchemaVersion is available from 15a
                            schemaAvailable(ii) = true;
                        end
                    end
                elseif nargout > 1
                    % No need to fillout the fileInfo
                    % Return with index of the file that's not supported
                    tf = false;
                    idx = ii;
                    return;
                end
            end
            fileInfo.Files = files(isMat);
            fileInfo.FileSizes = fileSizes(isMat);
            fileInfo.SchemaAvailable = schemaAvailable(isMat);
            fileInfo.ValuesOnlyAvailable= valuesOnlyAvailable(isMat);
        end
    end

    properties (GetAccess = public, SetAccess = private)
        % Mat Files containing key-value pairs.
        Files;
    end

    properties
        % Maximum size of each split.
        SplitSizeLimit;
        % Sizes of all files
        FileSizes;
        % KeyValueLimit for SplitReaders
        KeyValueLimit;
    end

    methods (Hidden)
        function data = readAllSplits(splitter)
            %READALLSPLITS Read all of the data from all the splits.
            %   Returns a table with a Value variable and, unless the
            %   first split is marked ValuesOnly, a Key variable. An
            %   empty table is returned when there are no files or no
            %   splits.
            %
            % This uses ValuesOnly boolean from the split information
            % to decide if only Values to be read from MAT-Files or not.
            olderFormatId = 'MATLAB:MatFile:OlderFormat';
            % Restore the caller's warning setting on exit instead of
            % unconditionally switching the warning back on.
            previousState = warning('off', olderFormatId);
            c = onCleanup(@() warning(previousState)); %#ok<NASGU>
            data = table;
            if isempty(splitter.Files) || isempty(splitter.Splits)
                return;
            end
            numSplits = numel(splitter.Splits);
            datasize = 0;
            splitSizeLimit = splitter.SplitSizeLimit;
            % datacumsizes(ii) is the total number of rows contributed
            % by splits 1..ii.
            datacumsizes = zeros(1, numSplits);
            for ii = 1:numSplits
                split = splitter.Splits(ii);
                splitSize = splitSizeLimit;
                % Rows remaining in the file from this split's offset.
                endSize = split.Size - split.Offset + 1;
                if endSize < splitSize
                    splitSize = endSize;
                end
                datasize = datasize + splitSize;
                if ii == 1
                    datacumsizes(ii) = splitSize;
                else
                    datacumsizes(ii) = datacumsizes(ii-1) + splitSize;
                end
            end
            import matlab.io.datastore.splitreader.MatKVFileSplitReader;
            import matlab.io.datastore.splitter.MatKVFileSplitter;
            rdr = MatKVFileSplitReader(numel(splitter.Files), splitSizeLimit, splitSizeLimit);
            rdr.Split = splitter.Splits(1);
            valuesOnly = rdr.Split.ValuesOnly;
            reset(rdr);
            % The first split determines the class used to preallocate
            % the output variables.
            [Key, Value] = readFullSplit(rdr, datacumsizes(1));
            if ~valuesOnly
                % Keys are not needed if ValuesOnly, for example in case of TallDatastore
                if iscell(Key)
                    data.Key = cell(datasize, 1);
                elseif isnumeric(Key)
                    data.Key = zeros(datasize, 1, 'like', Key);
                end
            end
            if iscell(Value)
                data.Value = cell(datasize, 1);
            elseif isnumeric(Value)
                data.Value = zeros(datasize, 1, 'like', Value);
            end
            keyClass = class(Key);
            valueClass = class(Value);
            if ~valuesOnly
                % Keys are not needed if ValuesOnly, for example in case of TallDatastore
                data.Key(1:datacumsizes(1), 1) = Key;
            end
            data.Value(1:datacumsizes(1), 1) = Value;
            for ii = 2:numSplits
                splitSize = datacumsizes(ii) - datacumsizes(ii-1);
                rdr.Split = splitter.Splits(ii);
                reset(rdr);
                [Key, Value] = readFullSplit(rdr, splitSize);
                stidx = datacumsizes(ii-1) + 1;
                if ~valuesOnly
                    % Keys are not needed if ValuesOnly, for example in case of TallDatastore
                    try
                        data.Key(stidx:datacumsizes(ii), 1) = Key;
                    catch e
                        MatKVFileSplitter.invalidKeyValueError(keyClass, class(Key),...
                            splitter.Splits(1).File, splitter.Splits(ii).File, e, true);
                    end
                end
                try
                    data.Value(stidx:datacumsizes(ii), 1) = Value;
                catch e
                    MatKVFileSplitter.invalidKeyValueError(valueClass, class(Value),...
                        splitter.Splits(1).File, splitter.Splits(ii).File, e, false);
                end
            end
        end

        % set all splits to have the SchemaAvailable field to the
        % given boolean.
        function setSchemaAvailable(splitter, tf)
            [splitter.Splits.SchemaAvailable] = deal(tf);
        end

        % set all splits to have the ValuesOnly field to the
        % given boolean.
        function setSplitsWithValuesOnly(splitter, tf)
            [splitter.Splits.ValuesOnly] = deal(tf);
        end

        % A MATKVFileSplitter uses chunked splits.
        function tf = isFullFileSplitter(~)
            tf = false;
        end

        %isSplitsOverAllOfFiles Returns true if a splitter splits is guaranteed to cover all of Files property.
        % A FileBasedSplitter that has been partitioned cannot guarantee that
        % the contained collection of splits is equivalent to creating a new
        % splitter from the Files property. This method allows clients of
        % FileBasedSplitter to guard against this.
        function tf = isSplitsOverAllOfFiles(splitter)
            tf = true;
            splits = splitter.Splits;
            if numel(splits) == 0
                return;
            end
            [~, ia, ic] = unique([splits.FileIndex], 'stable');
            % Accumulate splitSizes for each unique file index: each
            % split is counted as SplitSizeLimit key-value pairs.
            fileSizes = accumarray(ic, ones(size(ic)) * splitter.SplitSizeLimit);
            for ii = 1:numel(fileSizes)
                % Fewer counted pairs than the file's Size means some
                % splits of that file are missing.
                if fileSizes(ii) < splits(ia(ii)).Size
                    tf = false;
                    return;
                end
            end
        end

        % Return a reader for the ii-th split.
        function rdr = createReader(splitter, ii)
            rdr = matlab.io.datastore.splitreader.MatKVFileSplitReader(...
                numel(splitter.Files), splitter.KeyValueLimit, splitter.SplitSizeLimit);
            rdr.Split = splitter.Splits(ii);
        end

        % Create Splitter from existing Splits
        %
        % Splits passed as input must be identical in structure to the
        % splits used by this Splitter class. The KeyValueLimit of this
        % splitter is carried over to the copy.
        function splitterCopy = createCopyWithSplits(splitter, splits)
            splitterCopy = splitter.createFromSplits(splits);
            splitterCopy.KeyValueLimit = splitter.KeyValueLimit;
        end
    end

    methods (Static, Access = private)
        function invalidKeyValueError(c1, c2, f1, f2, e, keyError)
            %INVALIDKEYVALUEERROR Rethrow an assignment failure.
            %   A 'MATLAB:invalidConversion' error is replaced by a
            %   key (keyError true) or value (keyError false) conversion
            %   error built from classes c1, c2 and files f1, f2. Any
            %   other error e is rethrown unchanged.
            if strcmp(e.identifier, 'MATLAB:invalidConversion')
                msgid = 'MATLAB:datastoreio:keyvaluedatastore:invalidKeyConversion';
                if ~keyError
                    msgid = 'MATLAB:datastoreio:keyvaluedatastore:invalidValueConversion';
                end
                msg = message(msgid, c1, c2, f1, f2);
                throw(MException(msg));
            end
            throw(e);
        end
    end
end

% Using the fileinfo generated during initialization create splits.
% Each file is cut into consecutive splits of at most kvsplitsize
% key-value pairs; the result is one struct array over all files.
function [splits, fileInfo] = getSplitsFromInfo(fileInfo, kvsplitsize)
    numFiles = numel(fileInfo.Files);
    splits = cell(numFiles, 1);
    for ii = 1:numFiles
        numkv = fileInfo.FileSizes(ii);
        % 1-based starting position of each split within the file.
        offsets = 1:kvsplitsize:numkv;
        % No need to check if nsplits == 0, since empty files are filtered out
        nsplits = numel(offsets);
        % Use FileIndex in MatKVFileSplitReader to cache mat
        % file objects, as they are expensive proportional to
        % the number of key value pairs in the mat file.
        filenames = repmat(fileInfo.Files(ii), 1, nsplits);
        sizes = repmat(numkv, 1, nsplits);
        fileIdcs = repmat(ii, 1, nsplits);
        schAvails = repmat(fileInfo.SchemaAvailable(ii), 1, nsplits);
        valuesOnly = repmat(fileInfo.ValuesOnlyAvailable(ii), 1, nsplits);
        splits{ii} = struct('File', filenames, 'Size', num2cell(sizes), ...
            'Offset', num2cell(offsets), 'FileIndex', num2cell(fileIdcs), ...
            'SchemaAvailable', num2cell(schAvails), 'ValuesOnly', num2cell(valuesOnly));
    end
    splits = [splits{:}];
end

% Parse the filename when ValuesOnly is true - for TallDatastore.
% For example:
%    A filename of the form 'array_r10_1_snapshot_0x8A.mat' matches the
%    pattern; here '8A' represents the number of values in the MAT-file
%    in hex form.
%  - outInfo is a struct whose size field holds that number, when the
%    filename matches the pattern and the number can be converted.
%  - otherwise outInfo.size is taken from the whos information of the
%    file (first dimension of its second variable).
%  - outInfo is [] when neither source yields a size.
function outInfo = iParseValuesOnlyFilename(filename)
    import matlab.io.datastore.splitter.MatKVFileSplitter;
    [~,name,ext] = fileparts(filename);
    pattern = ['\w*_',...
        MatKVFileSplitter.SNAPSHOT_SUFFIX_STR,'_',...
        MatKVFileSplitter.HEX_PREFIX_STR,...
        '(\w*)$'];
    numKV = regexp(name, pattern, 'tokens', 'once');
    outInfo = [];
    if strcmp(ext, '.mat') && ~isempty(numKV) && ~isempty(numKV{1})
        try
            % convert the hex number of values to decimal
            outInfo.size = hex2dec(numKV{1});
        catch
            % swallow and use legacy support check
        end
    end
    if isempty(outInfo)
        % Legacy MAT-file support check:
        % Use whos to find if the MAT-file is supported.
        % We reach here if the filenames are changed manually
        % or MAT-file is constructed manually
        % or if the file is not from tall/write
        [~, info] = MatKVFileSplitter.isMatSupported(filename, true);
        % The size is read from the second variable, so a file with a
        % single variable is reported as unsupported rather than
        % raising an index error.
        if numel(info) >= 2
            outInfo.size = info(2).size(1);
        end
    end
end