如何从表示坐标集合的字符串中提取数据?

问题描述 投票:0回答:4

我有一个长字符串,看起来像这样:

[(1, 1.89), (1, 3.93), (4, 8.65), (4, 9.35), (1, 2.4), (1, 2.37), (1, 2.14), (1, 2.35), (4, 7.46), (4, 9.21), (4, 9.21), (1, 1.48), (1, 2.96), (4, 10.39), (1, 2.69), (1, 1.25), (1, 2.39), (1, 3.08), (1, 2.81), (1, 2.72), (1, 1.05), (1, 4.73), (1, 1.7), (1, 4.49), (1, 2.25), (1, 2.73), (1, 3.69), (1, 3.23), (4, 7.78), (1, 2.7), (4, 7.47), (1, 2.01), (4, 7.41), (1, 2.4), (1, 1.81), (1, 2.75), (4, 8.35), (4, 7.88), (1, 2.33), (4, 9.26), (1, 2.74), (1, 1.43), (1, 3.14), (1, 4.44), (1, 2.51), (4, 6.85), (4, 10.14), (4, 8.86), (4, 9.45), (4, 8.37), (1, 3.73), (4, 8.33), (1, 3.51), (4, 7.74), (1, 3.65), (1, 2.89), (1, 4.44), (1, 1.46), (1, 3.2), (4, 8.78), (1, 3.46), (4, 8.55), (4, 7.92), (1, 3.53), (1, 3.4), (1, 3.05), (4, 9.84), (4, 10.72), (1, 2.35), (4, 9.04), (1, 2.49), (1, 3.17), (1, 2.92), (1, 1.54), (1, 1.05), (4, 7.55), (4, 6.87), (4, 8.1), (4, 8.3), (1, 1.59), (4, 7.95), (4, 9.74), (1, 3.38), (4, 9.26), (4, 9.49), (4, 9.17), (1, 1.7), (1, 1.05), (4, 8.34), (1, 3.79), (4, 7.61), (1, 1.24), (4, 9.3), (1, 1.46), (4, 7.14), (1, 3.88), (1, 3.71), (4, 8.5), (1, 1.05), (4, 7.8), (1, 2.43), (4, 7.84), (1, 1.72), (1, 3.72), (4, 7.78), (1, 1.34), (1, 1.85), (1, 2.15), (1, 1.64), (4, 8.25), (4, 8.06), (4, 9.28), (1, 1.05), (4, 8.85), (1, 1.91), (4, 8.31), (4, 8.69), (1, 1.93), (4, 8.26), (1, 1.96), (1, 3.09), (1, 2.7), (4, 9.07), (4, 9.12), (4, 9.62), (1, 3.98), (1, 1.05), (4, 8.31), (1, 2.35), (4, 7.07), (1, 4.01), (1, 1.05), (4, 8.04), (1, 1.73), (1, 1.21), (4, 8.66), (4, 10.87), (1, 4.13), (4, 8.75), (4, 7.61), (4, 7.44), (1, 1.05), (1, 2.81), (1, 3.27), (4, 9.52), (4, 8.67), (4, 8.93), (1, 1.05), (1, 3.36), (4, 9.08), (1, 2.8), (4, 7.95), (4, 9.65), (1, 3.09), (1, 3.25), (1, 3.11), (1, 2.6), (4, 8.99), (4, 8.02), (4, 8.84), (4, 8.22), (4, 8.71), (1, 2.23), (1, 2.25), (4, 8.67), (4, 9.06), (4, 6.9), (4, 8.85), (4, 8.29), (1, 3.07), (1, 4.18), (4, 7.63), (4, 8.88), (1, 2.33), (1, 1.84), (1, 2.95), (1, 3.26), (1, 2.61), (4, 6.2), (1, 3.41), (4, 9.61), (4, 8.38), (1, 
2.34), (4, 10.3), (4, 6.93), (1, 2.64), (4, 8.97), (4, 7.21), (1, 3.13), (4, 7.7), (1, 2.5), (4, 9.13), (4, 8.45), (4, 8.35), (1, 3.14), (4, 6.59), (1, 2.99), (4, 6.6), (4, 7.69), (1, 1.64), (4, 8.44), (1, 2.79), (4, 6.7), (1, 3.35), (1, 3.39), (4, 7.57), (1, 3.34), (4, 9.68), (4, 7.94), (1, 1.92), (4, 9.16), (4, 8.67), (1, 1.05), (4, 9.6), (4, 8.21), (1, 3.15), (1, 1.58), (4, 9.36), (1, 2.01)]

我希望最终得到两个向量X和Y(分别包含所有第一个值和所有第二个值)。我卡在了如何从这个字符串中去掉圆括号和方括号这一步。

我曾尝试将此字符串复制到MATLAB并使用textscan去除圆括号。但是,我仍然无法去除方括号。

string matlab data-cleaning string-parsing text-extraction
4个回答
1
投票

也许您可以尝试regexp + str2double,即

% Collect every unsigned integer or decimal literal found in s, in order of
% appearance, using regexp's named 'match' output.
numberText = regexp(s, '\d+(\.\d+)?', 'match');
values = str2double(numberText);
% Odd positions go to x and even positions go to y.
x = values(1:2:end);
y = values(2:2:end);

DATA

s = "[(1, 1.89), (1, 3.93), (4, 8.65), (4, 9.35), (1, 2.4), (1, 2.37), (1, 2.14), (1, 2.35), (4, 7.46), (4, 9.21), (4, 9.21), (1, 1.48), (1, 2.96), (4, 10.39), (1, 2.69), (1, 1.25), (1, 2.39), (1, 3.08), (1, 2.81), (1, 2.72), (1, 1.05), (1, 4.73), (1, 1.7), (1, 4.49), (1, 2.25), (1, 2.73), (1, 3.69), (1, 3.23), (4, 7.78), (1, 2.7), (4, 7.47), (1, 2.01), (4, 7.41), (1, 2.4), (1, 1.81), (1, 2.75), (4, 8.35), (4, 7.88), (1, 2.33), (4, 9.26), (1, 2.74), (1, 1.43), (1, 3.14), (1, 4.44), (1, 2.51), (4, 6.85), (4, 10.14), (4, 8.86), (4, 9.45), (4, 8.37), (1, 3.73), (4, 8.33), (1, 3.51), (4, 7.74), (1, 3.65), (1, 2.89), (1, 4.44), (1, 1.46), (1, 3.2), (4, 8.78), (1, 3.46), (4, 8.55), (4, 7.92), (1, 3.53), (1, 3.4), (1, 3.05), (4, 9.84), (4, 10.72), (1, 2.35), (4, 9.04), (1, 2.49), (1, 3.17), (1, 2.92), (1, 1.54), (1, 1.05), (4, 7.55), (4, 6.87), (4, 8.1), (4, 8.3), (1, 1.59), (4, 7.95), (4, 9.74), (1, 3.38), (4, 9.26), (4, 9.49), (4, 9.17), (1, 1.7), (1, 1.05), (4, 8.34), (1, 3.79), (4, 7.61), (1, 1.24), (4, 9.3), (1, 1.46), (4, 7.14), (1, 3.88), (1, 3.71), (4, 8.5), (1, 1.05), (4, 7.8), (1, 2.43), (4, 7.84), (1, 1.72), (1, 3.72), (4, 7.78), (1, 1.34), (1, 1.85), (1, 2.15), (1, 1.64), (4, 8.25), (4, 8.06), (4, 9.28), (1, 1.05), (4, 8.85), (1, 1.91), (4, 8.31), (4, 8.69), (1, 1.93), (4, 8.26), (1, 1.96), (1, 3.09), (1, 2.7), (4, 9.07), (4, 9.12), (4, 9.62), (1, 3.98), (1, 1.05), (4, 8.31), (1, 2.35), (4, 7.07), (1, 4.01), (1, 1.05), (4, 8.04), (1, 1.73), (1, 1.21), (4, 8.66), (4, 10.87), (1, 4.13), (4, 8.75), (4, 7.61), (4, 7.44), (1, 1.05), (1, 2.81), (1, 3.27), (4, 9.52), (4, 8.67), (4, 8.93), (1, 1.05), (1, 3.36), (4, 9.08), (1, 2.8), (4, 7.95), (4, 9.65), (1, 3.09), (1, 3.25), (1, 3.11), (1, 2.6), (4, 8.99), (4, 8.02), (4, 8.84), (4, 8.22), (4, 8.71), (1, 2.23), (1, 2.25), (4, 8.67), (4, 9.06), (4, 6.9), (4, 8.85), (4, 8.29), (1, 3.07), (1, 4.18), (4, 7.63), (4, 8.88), (1, 2.33), (1, 1.84), (1, 2.95), (1, 3.26), (1, 2.61), (4, 6.2), (1, 3.41), (4, 9.61), (4, 8.38), (1, 
2.34), (4, 10.3), (4, 6.93), (1, 2.64), (4, 8.97), (4, 7.21), (1, 3.13), (4, 7.7), (1, 2.5), (4, 9.13), (4, 8.45), (4, 8.35), (1, 3.14), (4, 6.59), (1, 2.99), (4, 6.6), (4, 7.69), (1, 1.64), (4, 8.44), (1, 2.79), (4, 6.7), (1, 3.35), (1, 3.39), (4, 7.57), (1, 3.34), (4, 9.68), (4, 7.94), (1, 1.92), (4, 9.16), (4, 8.67), (1, 1.05), (4, 9.6), (4, 8.21), (1, 3.15), (1, 1.58), (4, 9.36), (1, 2.01)]";

1
投票

幸运的是,您的数据结构良好,因此您可以做几件事:

function vec = q60595838(s)
if ~nargin
  s = "[(1, 1.89), (1, 3.93), (4, 8.65), (4, 9.35), (1, 2.4), (1, 2.37), (1, 2.14), (1, 2.35), (4, 7.46), (4, 9.21), (4, 9.21), (1, 1.48), (1, 2.96), (4, 10.39), (1, 2.69), (1, 1.25), (1, 2.39), (1, 3.08), (1, 2.81), (1, 2.72), (1, 1.05), (1, 4.73), (1, 1.7), (1, 4.49), (1, 2.25), (1, 2.73), (1, 3.69), (1, 3.23), (4, 7.78), (1, 2.7), (4, 7.47), (1, 2.01), (4, 7.41), (1, 2.4), (1, 1.81), (1, 2.75), (4, 8.35), (4, 7.88), (1, 2.33), (4, 9.26), (1, 2.74), (1, 1.43), (1, 3.14), (1, 4.44), (1, 2.51), (4, 6.85), (4, 10.14), (4, 8.86), (4, 9.45), (4, 8.37), (1, 3.73), (4, 8.33), (1, 3.51), (4, 7.74), (1, 3.65), (1, 2.89), (1, 4.44), (1, 1.46), (1, 3.2), (4, 8.78), (1, 3.46), (4, 8.55), (4, 7.92), (1, 3.53), (1, 3.4), (1, 3.05), (4, 9.84), (4, 10.72), (1, 2.35), (4, 9.04), (1, 2.49), (1, 3.17), (1, 2.92), (1, 1.54), (1, 1.05), (4, 7.55), (4, 6.87), (4, 8.1), (4, 8.3), (1, 1.59), (4, 7.95), (4, 9.74), (1, 3.38), (4, 9.26), (4, 9.49), (4, 9.17), (1, 1.7), (1, 1.05), (4, 8.34), (1, 3.79), (4, 7.61), (1, 1.24), (4, 9.3), (1, 1.46), (4, 7.14), (1, 3.88), (1, 3.71), (4, 8.5), (1, 1.05), (4, 7.8), (1, 2.43), (4, 7.84), (1, 1.72), (1, 3.72), (4, 7.78), (1, 1.34), (1, 1.85), (1, 2.15), (1, 1.64), (4, 8.25), (4, 8.06), (4, 9.28), (1, 1.05), (4, 8.85), (1, 1.91), (4, 8.31), (4, 8.69), (1, 1.93), (4, 8.26), (1, 1.96), (1, 3.09), (1, 2.7), (4, 9.07), (4, 9.12), (4, 9.62), (1, 3.98), (1, 1.05), (4, 8.31), (1, 2.35), (4, 7.07), (1, 4.01), (1, 1.05), (4, 8.04), (1, 1.73), (1, 1.21), (4, 8.66), (4, 10.87), (1, 4.13), (4, 8.75), (4, 7.61), (4, 7.44), (1, 1.05), (1, 2.81), (1, 3.27), (4, 9.52), (4, 8.67), (4, 8.93), (1, 1.05), (1, 3.36), (4, 9.08), (1, 2.8), (4, 7.95), (4, 9.65), (1, 3.09), (1, 3.25), (1, 3.11), (1, 2.6), (4, 8.99), (4, 8.02), (4, 8.84), (4, 8.22), (4, 8.71), (1, 2.23), (1, 2.25), (4, 8.67), (4, 9.06), (4, 6.9), (4, 8.85), (4, 8.29), (1, 3.07), (1, 4.18), (4, 7.63), (4, 8.88), (1, 2.33), (1, 1.84), (1, 2.95), (1, 3.26), (1, 2.61), (4, 6.2), (1, 3.41), (4, 9.61), (4, 8.38), 
(1, 2.34), (4, 10.3), (4, 6.93), (1, 2.64), (4, 8.97), (4, 7.21), (1, 3.13), (4, 7.7), (1, 2.5), (4, 9.13), (4, 8.45), (4, 8.35), (1, 3.14), (4, 6.59), (1, 2.99), (4, 6.6), (4, 7.69), (1, 1.64), (4, 8.44), (1, 2.79), (4, 6.7), (1, 3.35), (1, 3.39), (4, 7.57), (1, 3.34), (4, 9.68), (4, 7.94), (1, 1.92), (4, 9.16), (4, 8.67), (1, 1.05), (4, 9.6), (4, 8.21), (1, 3.15), (1, 1.58), (4, 9.36), (1, 2.01)]";
end

%% Method 1:
% Cut the text at every "), " separator.
pairChunks = strsplit(s, "), ");
% Remove all bracket, parenthesis and comma characters from each piece.
strippedPairs = erase(pairChunks, ["[", "(", ")", "]", ","]);
% Transpose the pieces, split each one on a single space, and convert the
% resulting string array to double.
pairColumn = strippedPairs.';
vec = double(split(pairColumn, " "));

%% Method 2:
% Evaluate the text with Python's ast.literal_eval and convert the result
% to a MATLAB cell array.
pyAst = py.importlib.import_module('ast');
pyPairs = cell(pyAst.literal_eval(s));
% Convert the first two elements of one item into a 1-by-2 double row.
toRow = @(item)[double(item{1}), double(item{2})];
rowCells = cellfun(toRow, pyPairs, 'UniformOutput', false);
% Transpose the cell of rows and concatenate it into one numeric matrix.
vec = cell2mat(rowCells.');

其中后者要求安装Python并被MATLAB识别。


1
投票

如果您想使用textscan,也可以这样做:

test_txt = ['(1, 1.89), (1, 3.93), (4, 8.65), (4, 9.35), (1, 2.4), (1, 2.37), (1, 2.14), (1, 2.35), (4, 7.46), (4, 9.21), (4, 9.21), (1, 1.48), (1, 2.96), (4, 10.39), (1, 2.69), (1, 1.25), (1, 2.39), (1, 3.08), (1, 2.81), (1, 2.72), (1, 1.05), (1, 4.73), (1, 1.7), (1, 4.49), (1, 2.25), (1, 2.73), (1, 3.69), (1, 3.23), (4, 7.78), (1, 2.7), (4, 7.47), (1, 2.01), (4, 7.41), (1, 2.4), (1, 1.81), (1, 2.75), (4, 8.35), (4, 7.88), (1, 2.33), (4, 9.26), (1, 2.74), (1, 1.43), (1, 3.14), (1, 4.44), (1, 2.51), (4, 6.85), (4, 10.14), (4, 8.86), (4, 9.45), (4, 8.37), (1, 3.73), (4, 8.33), (1, 3.51), (4, 7.74), (1, 3.65), (1, 2.89), (1, 4.44), (1, 1.46), (1, 3.2), (4, 8.78), (1, 3.46), (4, 8.55), (4, 7.92), (1, 3.53), (1, 3.4), (1, 3.05), (4, 9.84), (4, 10.72), (1, 2.35), (4, 9.04), (1, 2.49), (1, 3.17), (1, 2.92), (1, 1.54), (1, 1.05), (4, 7.55), (4, 6.87), (4, 8.1), (4, 8.3), (1, 1.59), (4, 7.95), (4, 9.74), (1, 3.38), (4, 9.26), (4, 9.49), (4, 9.17), (1, 1.7), (1, 1.05), (4, 8.34), (1, 3.79), (4, 7.61), (1, 1.24), (4, 9.3), (1, 1.46), (4, 7.14), (1, 3.88), (1, 3.71), (4, 8.5), (1, 1.05), (4, 7.8), (1, 2.43), (4, 7.84), (1, 1.72), (1, 3.72), (4, 7.78), (1, 1.34), (1, 1.85), (1, 2.15), (1, 1.64), (4, 8.25), (4, 8.06), (4, 9.28), (1, 1.05), (4, 8.85), (1, 1.91), (4, 8.31), (4, 8.69), (1, 1.93), (4, 8.26), (1, 1.96), (1, 3.09), (1, 2.7), (4, 9.07), (4, 9.12), (4, 9.62), (1, 3.98), (1, 1.05), (4, 8.31), (1, 2.35), (4, 7.07), (1, 4.01), (1, 1.05), (4, 8.04), (1, 1.73), (1, 1.21), (4, 8.66), (4, 10.87), (1, 4.13), (4, 8.75), (4, 7.61), (4, 7.44), (1, 1.05), (1, 2.81), (1, 3.27), (4, 9.52), (4, 8.67), (4, 8.93), (1, 1.05), (1, 3.36), (4, 9.08), (1, 2.8), (4, 7.95), (4, 9.65), (1, 3.09), (1, 3.25), (1, 3.11), (1, 2.6), (4, 8.99), (4, 8.02), (4, 8.84), (4, 8.22), (4, 8.71), (1, 2.23), (1, 2.25), (4, 8.67), (4, 9.06), (4, 6.9), (4, 8.85), (4, 8.29), (1, 3.07), (1, 4.18), (4, 7.63), (4, 8.88), (1, 2.33), (1, 1.84), (1, 2.95), (1, 3.26), (1, 2.61), (4, 6.2), (1, 3.41), (4, 9.61), (4, 
8.38), (1, 2.34), (4, 10.3), (4, 6.93), (1, 2.64), (4, 8.97), (4, 7.21), (1, 3.13), (4, 7.7), (1, 2.5), (4, 9.13), (4, 8.45), (4, 8.35), (1, 3.14), (4, 6.59), (1, 2.99), (4, 6.6), (4, 7.69), (1, 1.64), (4, 8.44), (1, 2.79), (4, 6.7), (1, 3.35), (1, 3.39), (4, 7.57), (1, 3.34), (4, 9.68), (4, 7.94), (1, 1.92), (4, 9.16), (4, 8.67), (1, 1.05), (4, 9.6), (4, 8.21), (1, 3.15), (1, 1.58), (4, 9.36), (1, 2.01)'] ;
% Number of columns of test_txt (its character count when it is a char row).
s = size(test_txt,2) ;
% Estimate the number of pairs by assuming each one occupies 9 characters.
% NOTE(review): this is only an estimate -- the pairs in the text do not all
% have the same width; counting the '(' characters gives the exact count.
N = floor(s/9) ;
% Apply the pattern '(%d,%.2f),' at most N times; C{1} receives the %d field
% and C{2} the %.2f field of every match.
C = textscan(test_txt,'(%d,%.2f),',N) ;
x = C{1} ;
y = C{2} ;

我试过了,我认为它可以按您的要求工作(我检查了x和y的第一对和最后一对值,它们与test_txt中的值相同)。这里N的计算方法是:用char向量test_txt的长度s除以一个括号对(例如(1, 1.11))的长度(共9个字符,包括括号和逗号后的空格)。

PS:

(编辑!!)

注意,只有当所有数对的书写格式(宽度)都相同时,上面N的计算才成立!但这里并非如此:例如第五对是(1, 2.4),而之前的计算s/9只有在它被写成(1, 2.40)时才是一致的。

定义N的更好方法是统计左圆括号"("的个数,如下所示:

% Count the '(' characters in test_txt by building a logical mask and
% counting its nonzero entries.
is_open_paren = (test_txt(:) == '(') ;
N = nnz(is_open_paren) ;

这应该可以工作(对不起,谢谢!)!

我认为您可以调整Dev-iL的解决方案,使其变得更简单、更快:

% Take the text between each "(" and ")".
pairText = extractBetween(s, "(", ")");
% Split every piece on ", " along dimension 2 and convert the resulting
% string array to double.
vec = double(split(pairText, ", ", 2));

出于好玩,我对这几个解决方案做了基准测试。Kiwi GM的解决方案没有给出正确的结果,所以我没有把它包括在内。

Dev-iL's performance 3.93
ThomasIsCoding's performance 68.6
My performance 0.873

function profFunc

s = "[(1, 1.89), (1, 3.93), (4, 8.65), (4, 9.35), (1, 2.4), (1, 2.37), (1, 2.14), (1, 2.35), (4, 7.46), (4, 9.21), (4, 9.21), (1, 1.48), (1, 2.96), (4, 10.39), (1, 2.69), (1, 1.25), (1, 2.39), (1, 3.08), (1, 2.81), (1, 2.72), (1, 1.05), (1, 4.73), (1, 1.7), (1, 4.49), (1, 2.25), (1, 2.73), (1, 3.69), (1, 3.23), (4, 7.78), (1, 2.7), (4, 7.47), (1, 2.01), (4, 7.41), (1, 2.4), (1, 1.81), (1, 2.75), (4, 8.35), (4, 7.88), (1, 2.33), (4, 9.26), (1, 2.74), (1, 1.43), (1, 3.14), (1, 4.44), (1, 2.51), (4, 6.85), (4, 10.14), (4, 8.86), (4, 9.45), (4, 8.37), (1, 3.73), (4, 8.33), (1, 3.51), (4, 7.74), (1, 3.65), (1, 2.89), (1, 4.44), (1, 1.46), (1, 3.2), (4, 8.78), (1, 3.46), (4, 8.55), (4, 7.92), (1, 3.53), (1, 3.4), (1, 3.05), (4, 9.84), (4, 10.72), (1, 2.35), (4, 9.04), (1, 2.49), (1, 3.17), (1, 2.92), (1, 1.54), (1, 1.05), (4, 7.55), (4, 6.87), (4, 8.1), (4, 8.3), (1, 1.59), (4, 7.95), (4, 9.74), (1, 3.38), (4, 9.26), (4, 9.49), (4, 9.17), (1, 1.7), (1, 1.05), (4, 8.34), (1, 3.79), (4, 7.61), (1, 1.24), (4, 9.3), (1, 1.46), (4, 7.14), (1, 3.88), (1, 3.71), (4, 8.5), (1, 1.05), (4, 7.8), (1, 2.43), (4, 7.84), (1, 1.72), (1, 3.72), (4, 7.78), (1, 1.34), (1, 1.85), (1, 2.15), (1, 1.64), (4, 8.25), (4, 8.06), (4, 9.28), (1, 1.05), (4, 8.85), (1, 1.91), (4, 8.31), (4, 8.69), (1, 1.93), (4, 8.26), (1, 1.96), (1, 3.09), (1, 2.7), (4, 9.07), (4, 9.12), (4, 9.62), (1, 3.98), (1, 1.05), (4, 8.31), (1, 2.35), (4, 7.07), (1, 4.01), (1, 1.05), (4, 8.04), (1, 1.73), (1, 1.21), (4, 8.66), (4, 10.87), (1, 4.13), (4, 8.75), (4, 7.61), (4, 7.44), (1, 1.05), (1, 2.81), (1, 3.27), (4, 9.52), (4, 8.67), (4, 8.93), (1, 1.05), (1, 3.36), (4, 9.08), (1, 2.8), (4, 7.95), (4, 9.65), (1, 3.09), (1, 3.25), (1, 3.11), (1, 2.6), (4, 8.99), (4, 8.02), (4, 8.84), (4, 8.22), (4, 8.71), (1, 2.23), (1, 2.25), (4, 8.67), (4, 9.06), (4, 6.9), (4, 8.85), (4, 8.29), (1, 3.07), (1, 4.18), (4, 7.63), (4, 8.88), (1, 2.33), (1, 1.84), (1, 2.95), (1, 3.26), (1, 2.61), (4, 6.2), (1, 3.41), (4, 9.61), (4, 8.38), (1, 
2.34), (4, 10.3), (4, 6.93), (1, 2.64), (4, 8.97), (4, 7.21), (1, 3.13), (4, 7.7), (1, 2.5), (4, 9.13), (4, 8.45), (4, 8.35), (1, 3.14), (4, 6.59), (1, 2.99), (4, 6.6), (4, 7.69), (1, 1.64), (4, 8.44), (1, 2.79), (4, 6.7), (1, 3.35), (1, 3.39), (4, 7.57), (1, 3.34), (4, 9.68), (4, 7.94), (1, 1.92), (4, 9.16), (4, 8.67), (1, 1.05), (4, 9.6), (4, 8.21), (1, 3.15), (1, 1.58), (4, 9.36), (1, 2.01)]";
% Number of repetitions used for every timed approach.
nReps = 10000;

% Approach 1: strsplit / erase / split on string arrays.
devilTimer = tic;
for rep = 1:nReps
    pieces = strsplit(s, "), ");
    cleaned = erase(pieces, ["[", "(", ")", "]", ","]);
    vec = double(split(cleaned.', " "));
end
fprintf("Dev-iL's performance %.3g\n", toc(devilTimer));

% Approach 2: regular expression match followed by str2double.
regexTimer = tic;
for rep = 1:nReps
    numberText = regexp(s, '\d+(\.\d+)?', 'match');
    values = str2double(numberText);
    x = values(1:2:end);
    y = values(2:2:end);
end
fprintf("ThomasIsCoding's performance %.3g\n", toc(regexTimer));

% Approach 3: extractBetween followed by split along dimension 2.
extractTimer = tic;
for rep = 1:nReps
    pairText = extractBetween(s, "(", ")");
    vec = double(split(pairText, ", ", 2));
end
fprintf("My performance %.3g\n", toc(extractTimer));

0
投票

我认为您可以调整Dev-iL的解决方案,使其变得更简单、更快

© www.soinside.com 2019 - 2024. All rights reserved.