使用SAS BASE从JSON中的变量中解析具有unicode字符的表

问题描述 投票:12回答:1

我在使用vars中的unicode char解析JSON时遇到了问题。所以,我有下一个JSON(例子):

 {  
   "SASJSONExport":"1.0",
   "SASTableData+TEST":[  
      {  
         "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
         "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":4,
         "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0031"
      },
      {  
         "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
         "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":2,
         "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0032"
      },
      {  
         "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":1,
         "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":42,
         "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0033"
      }
   ]
}

要从JSON解析表,我使用SAS引擎:

/* JSON LIBNAME engine: parse the file referenced by fileref injson
   into data sets under libref jsonfl.                              */
libname jsonfl JSON fileref=injson ;

上面的代码能够解码单元格中的字符,但变量名看起来却像缺失值:

+--------------+---------------------------+------------+---------+---------+
| ordinal_root | ordinal_SASTableData_TEST | __________ | _______ | ______  |
+--------------+---------------------------+------------+---------+---------+
|            1 |                         1 |          2 |       4 | Что-то1 |
|            1 |                         2 |          2 |       2 | Что-то2 |
|            1 |                         3 |          1 |      42 | Что-то3 |
+--------------+---------------------------+------------+---------+---------+

标题必须如下所示:

+--------------+---------------------------+------------+---------+---------+
| ordinal_root | ordinal_SASTableData_TEST | Переменная | Среднее | Строка  |
+--------------+---------------------------+------------+---------+---------+

所以我决定把这些unicode编码的变量名替换成 DIM_N_ 这样的名字。为此,我必须找出所有与下面这个正则表达式匹配的字符串:/([\s\w\d\\]+)\"\:/

但是,要从json中提取这些字符串,我需要把 '{','}','[',']',',' 这几个字符设为分隔符(delim)。可是一旦把它们设为dlm,读取之后就无法再重新组装出json了。所以我决定在这些字符之前插入字符 ~,然后把 ~ 设为dlm。

/* Copy the raw JSON byte by byte to fileref delim.  Before each JSON
   structural character ('{','}','[',']',','), "put '7E'x;" writes a
   tilde and ends the current output record, so ~ can later serve as
   an input delimiter.  (This also produces the stray leading/trailing
   tildes shown below -- the very problem being discussed.)           */
data delim;
    infile injson lrecl=1073741823 nopad;
    file  delim;
    input char1 $char1. @@;
        if char1 in ('{','}','[',']',',') then
            put '7E'x;
        put char1 $CHAR1. @@;
run;

我得到了无效的json文件:

~
{"SASJSONExport":"1.0"~
,"SASTableData+TEST":~
[  ~
{"\u0056\u0061\u0072":2~
,"\u006d\u0065\u0061\u006e":4~
,"\u004e\u0061\u006d\u0065":"\u0073\u006d\u0074\u0068\u0031"~
}~
,  ~
{"\u0056\u0061\u0072":2~
,"\u006d\u0065\u0061\u006e":2~
,"\u004e\u0061\u006d\u0065":"\u0073\u006d\u0074\u0068\u0032"~
}~
,  ~
{"\u0056\u0061\u0072":1~
,"\u006d\u0065\u0061\u006e":42~
,"\u004e\u0061\u006d\u0065":"\u0073\u006d\u0074\u0068\u0033"~
}  ~
]~
}   

因此,下一步我将解析JSON并使用~作为分隔符:

/* Read the ~-delimited stream and, for each token, capture the text
   preceding ": (the JSON member name) into COLUMN via regex group 1.
   Output variables: column (match), r (regex id), char1 (raw token),
   pos (match position), n (iteration number).                        */
data transfer;
length column $2000;
retain r;
    infile delim  delimiter='7E'x nopad;
    input char1 : $4000. @@;
            /* The /o option compiles the pattern only once; without it
               prxparse allocates a brand-new regex id on every data
               step iteration, defeating the RETAIN above.             */
            r = prxparse('/([\s\w\d\\]+)\"\:/o');
            pos = prxmatch(r,char1);
            column = prxposn(r,1,char1);
        n= _n_;
run;

它有效...但我觉得那些做法太糟糕了,而且它有限制。

UPD1 设置如下选项,

/* Relax naming rules: long format names, extended (n-literal) member
   names, and arbitrary variable names.                               */
options vAlidfmtname=long VALIDMEMNAME=extend VALIDVARNAME=any;

返回:

+--------------+---------------------------+----------------------------+---------+--------------+
| ordinal_root | ordinal_SASTableData_TEST |         __________         | _______ |    ______    |
+--------------+---------------------------+----------------------------+---------+--------------+
|            1 |                         1 | авфа2 фвафв = фвыа - тфвыа |       4 | Что-то1 ,,,, |
|            1 |                         2 | авфа2 фвафв = фвыа - тфвыа |       2 | Что-то2      |
|            1 |                         3 | авфа2 фвафв = фвыа - тфвыа |    2017 | Что-то3      |
+--------------+---------------------------+----------------------------+---------+--------------+

所以我的问题是:

  1. 我可以在没有infile语句的情况下解码整个文件吗?
  2. 我可以在使用infile delimiter的同时,通过某个选项让分隔符不被删除吗?

欢迎充分的批评。

json regex parsing sas
1个回答
1
投票

UPD 我来到解决方案,而不必手动编辑json映射文件,但使用正则表达式。

/* Reset the session: clear every libref and fileref, delete all WORK
   data sets, then assign fileref jsf to a UTF-8 output file.         */
libname _all_ clear;
filename _all_ clear;
/* Presumably re-created because "filename _all_ clear" above also
   removed these environment filerefs -- TODO confirm in SAS UE.      */
filename _PDFOUT temp;
filename _GSFNAME temp;
proc datasets lib=work kill memtype=data nolist; quit;
filename jsf '~/sasuser.v94/.json' encoding='utf-8';
/* Decode the \uXXXX escapes of the in-stream JSON with UNICODE() and
   write the resulting readable UTF-8 JSON to fileref jsf.            */
data _null_;
  file jsf;
  length js varchar(*);
  retain js;
  input;
  js=unicode(_infile_);
  put js;
  datalines;
{
  "SASJSONExport":"1.0",
  "SASTableData+TEST":[
    {
      "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
      "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":4,
      "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0031"
    },
    {
      "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
      "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":2,
      "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0032"
    },
    {
      "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":1,
      "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":42,
      "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0033"
    }
  ]
}
;
run;
/* Generate an automap: the JSON engine writes a map describing the
   parsed structure to fileref jsm (automap=replace), and the map file
   itself is then read back as JSON data through libref jsm so it can
   be post-processed with data steps.                                 */
filename jsm '~/sasuser.v94/.json.map' encoding='utf-8';
libname jsd json fileref=jsf map=jsm automap=replace;
libname jsm json fileref=jsm;
/* Join the automap's DATASETS and DATASETS_VARIABLES tables so each
   variable row carries its data set's DSNAME and TABLEPATH, then
   order the rows for sequential map reconstruction.                  */
data jsmm;
  merge jsm.datasets jsm.datasets_variables;
  by ordinal_DATASETS;
run;
proc sort data=jsmm; by ordinal_root ordinal_DATASETS; run;
/* Rebuild the JSON map as one accumulated string, adding a LABEL
   (taken from the last PATH segment) to every variable whose NAME was
   mangled to all underscores, then overwrite the map file.           */
data _null_;
  set work.jsmm end=last;
  if _N_=1 then do;
    length s varchar(*) ds varchar(*);
    retain s ds prx;
    s='{"DATASETS":[';
    ds='';
    /* Finds the first character that is NOT an underscore. */
    prx=prxparse('/[^_]/');
  end;
  /* Same data set as previous row: just separate the variable entries. */
  if ds=dsname then s=s||',';
  else do;
    /* New data set: close the previous VARIABLES array (if any) and
       open a new DATASETS entry.                                     */
    ds=dsname;
    if _N_^=1 then s=s||']},';
    s=cats(s,'{"DSNAME":"',ds,'","TABLEPATH":"',tablepath,'","VARIABLES":[');
  end;
  s=cats(s,'{"NAME":"',name,'","TYPE":"',type,'","PATH":"',path,'"');
  /* If the first non-underscore occurs beyond the trimmed length, it
     was found only in the trailing blank padding -- i.e. NAME consists
     solely of underscores (every national character was replaced), so
     emit a LABEL holding the original name from the PATH.            */
  if prxmatch(prx,name) > length(name) then
    s=cats(s,',"LABEL":"',scan(path,-1,'/'),'"');
  s=s||'}';
  if last then do;
    /* Close the final VARIABLES array and the map, then write it out. */
    s=s||']}]}';
    file jsm;
    put s;
  end;
run;
/* Re-assign the JSON library using the corrected map, then print the
   table with its restored column labels.                             */
libname jsd json fileref=jsf map=jsm;
proc print data=jsd.SASTableData_TEST label noobs; run;

解决方案的第一个变体 这是一个快速的解决方案。 首先准备输入数据:

/* Variant 1 setup: clear librefs/filerefs, then decode the \uXXXX
   escapes of the in-stream JSON with UNICODE() and write the readable
   UTF-8 JSON to fileref jsf.                                         */
libname _all_ clear;
filename _all_ clear;
filename jsf '~/sasuser.v94/.json' encoding='utf-8';
data _null_;
  file jsf;
  length js varchar(*);
  input;
  js=unicode(_infile_);
  put js;
  datalines;
{
  "SASJSONExport":"1.0",
  "SASTableData+TEST": [
    {
      "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
      "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":4,
      "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0031"
    },
    {
      "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
      "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":2,
      "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0032"
    },
    {
      "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":1,
      "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":42,
      "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0033"
    }
  ]
}
;
run;

输出文件.json

{
"SASJSONExport":"1.0",
"SASTableData+TEST": [
{
"Переменная":2,
"Среднее":4,
"Строка":"Что-то1"
},
{
"Переменная":2,
"Среднее":2,
"Строка":"Что-то2"
},
{
"Переменная":1,
"Среднее":42,
"Строка":"Что-то3"
}
]
}

然后创建json映射文件.json.map

/* Let the JSON engine generate a map file describing the parsed
   structure (automap=create writes it to fileref jsmf).              */
filename jsmf '~/sasuser.v94/.json.map' encoding='utf-8';
libname jsm json fileref=jsf map=jsmf automap=create;

.json.map的内容:

{
  "DATASETS": [
    {
      "DSNAME": "root",
      "TABLEPATH": "/root",
      "VARIABLES": [
        {
          "NAME": "ordinal_root",
          "TYPE": "ORDINAL",
          "PATH": "/root"
        },
        {
          "NAME": "SASJSONExport",
          "TYPE": "CHARACTER",
          "PATH": "/root/SASJSONExport",
          "CURRENT_LENGTH": 3
        }
      ]
    },
    {
      "DSNAME": "SASTableData_TEST",
      "TABLEPATH": "/root/SASTableData+TEST",
      "VARIABLES": [
        {
          "NAME": "ordinal_root",
          "TYPE": "ORDINAL",
          "PATH": "/root"
        },
        {
          "NAME": "ordinal_SASTableData_TEST",
          "TYPE": "ORDINAL",
          "PATH": "/root/SASTableData+TEST"
        },
        {
          "NAME": "____________________",
          "TYPE": "NUMERIC",
          "PATH": "/root/SASTableData+TEST/Переменная"
        },
        {
          "NAME": "______________",
          "TYPE": "NUMERIC",
          "PATH": "/root/SASTableData+TEST/Среднее"
        },
        {
          "NAME": "____________",
          "TYPE": "CHARACTER",
          "PATH": "/root/SASTableData+TEST/Строка",
          "CURRENT_LENGTH": 12
        }
      ]
    }
  ]
}

让我们通过删除不必要的数据集的描述并添加标签来稍微改变文件:

{
  "DATASETS": [
    {
      "DSNAME": "SASTableData_TEST",
      "TABLEPATH": "/root/SASTableData+TEST",
      "VARIABLES": [
        {
          "NAME": "ordinal_root",
          "TYPE": "ORDINAL",
          "PATH": "/root"
        },
        {
          "NAME": "ordinal_SASTableData_TEST",
          "TYPE": "ORDINAL",
          "PATH": "/root/SASTableData+TEST"
        },
        {
          "NAME": "____________________",
          "TYPE": "NUMERIC",
          "PATH": "/root/SASTableData+TEST/Переменная",
          "LABEL": "Переменная"
        },
        {
          "NAME": "______________",
          "TYPE": "NUMERIC",
          "PATH": "/root/SASTableData+TEST/Среднее",
          "LABEL": "Среднее"
        },
        {
          "NAME": "____________",
          "TYPE": "CHARACTER",
          "PATH": "/root/SASTableData+TEST/Строка",
          "LABEL": "Строка",
          "CURRENT_LENGTH": 12
        }
      ]
    }
  ]
}

然后再试一次:

/* Read the JSON again through the hand-edited map so the Cyrillic
   names appear as labels, then print the table.                      */
libname jsd json fileref=jsf map=jsmf;
proc print data=jsd.SASTableData_TEST label noobs; run;

结果:

+--------------+---------------------------+------------+---------+-----------+
| ordinal_root | ordinal_SASTableData_TEST | Переменная | Среднее |    Строка |
+--------------+---------------------------+------------+---------+-----------+
|            1 |                         1 |          2 |       4 | Что-то1   |
|            1 |                         2 |          2 |       2 | Что-то2   |
|            1 |                         3 |          1 |      42 | Что-то3   |
+--------------+---------------------------+------------+---------+-----------+

所有这些都是在SAS大学版中完成的。

© www.soinside.com 2019 - 2024. All rights reserved.