从文件中字符的索引获取行号

Question

我有一个由单词组成的字符串输入。我正在使用 regex.exec (g) 通过函数获取所有单词

getWord(input)

所以我的输入可能如下所示：


word word2
someword blah

我从 exec 得到的是包含匹配的

index

的对象。所以它是数组：


[ 'word', index: 0, input: "..."]
...
[ 'someword', index: 11, input: "..."]
...

我需要的是使用索引（11）轻松计算单词“someword”在第2行（因为我没有任何其他值告诉我行数是多少）

这就是我想出的：不断匹配 '\n'，直到匹配到的索引大于单词的索引为止。不确定这在 10k 行的文件中是否会出现问题。

想法片段：

# Counts the "\n" characters of `input` located at or before `index` and
# returns that count plus one (so the first line is line 1).
# Every call scans `input` from the beginning.
getLineFromIndex: (index, input) ->
  # New /g regex per call: each exec() resumes right after the previous match.
  regex = /\n/g
  line = 1

  loop
    match = regex.exec(input)
    # Stop when no newline is left or the next newline lies after `index`.
    break if not match? or match.index > index

    # This newline is at or before `index`, so it is counted.
    line++

  return line

这里可以进行很大的优化。我可以保存正则表达式和最后一个匹配项，因此每次要检查行号时我不会迭代所有输入。仅当最后一个匹配的索引低于当前索引时，才会执行正则表达式。

这是优化后的最终想法：

  ###
    Cached variant: keeps the newline regex, the last newline match and the
    current line counter on the instance between calls.

    @variable content [String] is input content

    NOTE(review): each call advances the counter by at most one line, so it
    appears to expect calls with non-decreasing indexes that cross at most one
    newline per call - confirm against callers.
    NOTE(review): if @content contains no "\n", the first exec() returns null,
    @eof is still unset, and `@lastMatch.index` is evaluated on null.
  ###
  getLineFromIndex: (index) ->
    # Lazily create the shared /g regex and the 1-based line counter.
    @lineMatcher = @lineMatcher || /\n/g
    @lastLine = @lastLine || 1

    # Until the end of the content was reached, make sure a newline match is cached.
    if @eof isnt true
      @lastMatch = @lastMatch || @lineMatcher.exec(@content)

    # Past the last newline, or still before the cached one: line is unchanged.
    if @eof or index < @lastMatch.index
      return @lastLine
    else
      # The cached newline is at or before `index`: fetch the next one.
      match = @lineMatcher.exec(@content)
      if not @eof and match is null
        # No newline left; later calls return the counter without searching.
        @eof = true
      else
        @lastMatch = match

      @lastLine++

    return @lastLine

Answer 1

剪切输入 (a.substr(0, 11))。
拆分它 (a.substr(0, 11).split('\n'))。
数一下 (a.substr(0, 11).split('\n').length)。

Answer 2

你的伪代码似乎可以完成这项工作。但我不明白如何通过搜索单词的偏移量推断行号。我将按行分割输入文本，然后在数组中查找搜索到的单词，如果找到则返回行索引。

// Sample text: two lines separated by a single "\n" (the first line ends with a space).
var input= "word word2 \n"+
           "someword blah";


/**
 * Finds every line of `input` that contains `word`.
 *
 * @param {string} input - Text whose lines are separated by "\n".
 * @param {string} word - Substring to look for in each line.
 * @returns {number[]} Zero-based indexes of the matching lines, in order.
 */
function getLinesNumberOf(input, word) {
  var matches = [];
  var lines = input.split("\n");

  for (var lineIndex = 0; lineIndex < lines.length; lineIndex++) {
    var position = lines[lineIndex].indexOf(word);
    if (position !== -1) {
      matches.push(lineIndex);
    }
  }

  return matches;
}


// Demo call: logs the zero-based indexes of the lines of `input` containing "someword".
console.log(getLinesNumberOf(input,"someword"));

我添加了对搜索词多次出现的支持。

编辑

为了避免输入很大时消耗太多内存，您可以按顺序逐行解析（其优点与 SAX 相对于 DOM 的优点相同）：

/**
 * Scans `input` line by line, without building an array of all lines, and
 * collects the lines that contain `word`.
 *
 * The result is logged (as before) and is now also returned, so callers can
 * use it. All working variables are local; the previous version assigned to
 * undeclared names, which throws a ReferenceError in strict mode / ES modules.
 *
 * @param {string} word - Substring to look for in each line.
 * @param {string} input - Text whose lines are separated by "\n".
 * @returns {number[]} Zero-based indexes of the lines containing `word`.
 */
function getLinesNumberOf( word, input ){
    const lineNumbers = [];
    let currentLine = 0;
    let lineStart = 0;

    // `<=` so that a trailing "\n" still yields a final (empty) line.
    while (lineStart <= input.length) {
        // End of the current line: the next line break, or the end of the
        // text for the last line (no need to append "\n" to the input).
        let lineEnd = input.indexOf("\n", lineStart);
        if (lineEnd === -1) {
            lineEnd = input.length;
        }

        // Only the current line is sliced out and searched.
        if (input.slice(lineStart, lineEnd).includes(word)) {
            lineNumbers.push(currentLine);
        }

        // The next line starts right after the line break.
        lineStart = lineEnd + 1;
        currentLine++;
    }

    console.log(lineNumbers);
    return lineNumbers;
}

Answer 3

gwer最初提出的解决方案是这样的：

/**
 * Returns the 1-based line number of the position `index` in `text`,
 * computed by splitting everything before `index` on "\n".
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Number of "\n" before `index`, plus one.
 */
function getLineNumber(text, index) {
  const before = text.slice(0, index);
  const segments = before.split('\n');
  return segments.length;
}

但是还有更快的解决方案：

/**
 * Returns the 1-based line number of the position `index` in `text`,
 * computed by matching every "\n" in the part of the text before `index`.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Number of "\n" before `index`, plus one.
 */
function getLineNumber(text, index) {
  const newlines = text.slice(0, index).match(/\n/g);

  // match() yields null, not an empty array, when nothing matched.
  if (newlines === null) {
    return 1;
  }
  return newlines.length + 1;
}

根据我有限的基准测试，这是最快的，并且也应该使用最少的内存，因为它根本不操作输入文本：

/**
 * Returns the 1-based line number of the position `index` in `text` by
 * counting the "\n" characters located strictly before `index`.
 * No copy of the text is created.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Number of "\n" before `index`, plus one.
 */
function getLineNumberB(text, index) {
  let lineCount = 1;
  let newlineAt = text.indexOf('\n');

  // Jump from newline to newline until none is left before `index`.
  while (newlineAt !== -1 && newlineAt < index) {
    lineCount += 1;
    newlineAt = text.indexOf('\n', newlineAt + 1);
  }

  return lineCount;
}

如果您想处理不同的可能行结尾，您可以预处理文本（推荐）：

// Convert "\r\n" and lone "\r" line endings to "\n".
// NOTE: each "\r\n" becomes one character, so offsets taken from the
// un-normalized text no longer line up with the normalized text.
text = text.replace(/\r\n|\r/g, '\n');

或者您可以使用这些更复杂的解决方案：

/**
 * Returns the 1-based line number of the position `index` in `text`,
 * treating "\r\n", "\r" and "\n" each as one line break.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Number of line breaks before `index`, plus one.
 */
function getLineNumber(text, index) {
  const lineBreaks = text.slice(0, index).match(/\r\n|\r|\n/g);

  // match() yields null, not an empty array, when nothing matched.
  if (lineBreaks === null) {
    return 1;
  }
  return lineBreaks.length + 1;
}

/**
 * Returns the 1-based line number of the position `index` in `text` by
 * walking the characters before `index`, treating "\r\n", "\r" and "\n"
 * each as one line break. No copy of the text is created.
 *
 * @param {string} text - Full text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Number of line breaks before `index`, plus one.
 */
function getLineNumber(text, index) {
    let lineCount = 1;
    let pos = 0;

    while (pos < index) {
        const ch = text[pos];

        if (ch === '\r') {
            // "\r\n" is a single line break: step over the "\n" as well.
            if (text[pos + 1] === '\n') {
                pos += 1;
            }
            lineCount += 1;
        } else if (ch === '\n') {
            lineCount += 1;
        }

        pos += 1;
    }

    return lineCount;
}

从文件中字符的索引获取行号

问题描述投票：0回答：3

3个回答

最新问题

从文件中字符的索引获取行号

问题描述 投票：0回答：3

3个回答

最新问题

问题描述投票：0回答：3