如何将单词列表读入包含单词位置的元组?

问题描述 投票:0回答:3

我有一个文本文件,我想编写一个函数来读取该文件并返回一个元组列表,其中每个元组包含:字符串形式的单词、int 形式的单词所在行号,以及 int 形式的该单词最后一个字符的位置。输入示例:

example of the first line         
followed by the second line

输出示例:

[
  ("example",1,8);
  ("of",1,11);
  ("the",1,15);
  ("first",1,21);
  ("line",1,26);
  ("followed",2,13);
  ("by",2,16);
  ("the",2,20);
  ("second",2,27);
  ("line",2,32)
]
string algorithm ocaml
3个回答
1
投票

您要找的函数大致如下:

(** [read filename] reads every line of [filename] and returns one
    [(word, line, pos)] triple per word, in file order.

    - [word] is a non-empty run of characters between single spaces;
    - [line] is the 1-based line number;
    - [pos] is the 0-based index of the word's last character in its line.

    Empty pieces produced by consecutive, leading or trailing spaces are
    skipped, but they still advance the running offset so the positions of
    later words stay correct. *)
let read filename =
  In_channel.read_lines filename
  |> List.mapi ~f:(fun line data ->
      String.split data ~on:' '
      |> List.fold_map ~init:0 ~f:(fun start word ->
          (* [start] is the offset of this piece; [stop] is one past its end. *)
          let stop = start + String.length word in
          let entry =
            if String.is_empty word then None
            else Some (word, line + 1, stop - 1)
          in
          (* [+ 1] steps over the separator that follows the piece. *)
          (stop + 1, entry))
      |> snd
      |> List.filter_opt)
  |> List.concat

以下是如何使用它。首先安装依赖项,

opam install dune stdio merlin

接下来,设置您的项目,

dune init exe readlines --libs=base,stdio

然后在您最喜欢的编辑器中打开

readlines.ml
并将其内容替换为以下内容,

open Base
open Stdio

(** [read filename] reads every line of [filename] and returns one
    [(word, line, pos)] triple per word, in file order.

    - [word] is a non-empty run of characters between single spaces;
    - [line] is the 1-based line number;
    - [pos] is the 0-based index of the word's last character in its line.

    Empty pieces produced by consecutive, leading or trailing spaces are
    skipped, but they still advance the running offset so the positions of
    later words stay correct. *)
let read filename =
  In_channel.read_lines filename
  |> List.mapi ~f:(fun line data ->
      String.split data ~on:' '
      |> List.fold_map ~init:0 ~f:(fun start word ->
          (* [start] is the offset of this piece; [stop] is one past its end. *)
          let stop = start + String.length word in
          let entry =
            if String.is_empty word then None
            else Some (word, line + 1, stop - 1)
          in
          (* [+ 1] steps over the separator that follows the piece. *)
          (stop + 1, entry))
      |> snd
      |> List.filter_opt)
  |> List.concat

(** [print entries] writes each [(word, line, pos)] triple on its own line,
    formatted as [(word,line,pos)]. *)
let print entries =
  List.iter entries ~f:(fun (word, line_no, last_pos) ->
      printf "(%s,%d,%d)\n" word line_no last_pos)

(** [main filename] reads the words of [filename] with [read] and prints
    them with [print]. *)
let main filename = filename |> read |> print

(* Program entry point: expects exactly one command-line argument (the file
   to read) after the program name; anything else is reported as a failure. *)
let () =
  let args = Array.to_list (Sys.get_argv ()) in
  match args with
  | [ _; path ] -> main path
  | _ -> failwith "expects one argument: filename"

要运行和测试,请创建一个示例输入,例如名为

test.txt

的文件,内容如下:
example of the first line
followed by the second line

(确保最后一行后面有换行符)

现在你可以运行它了,

dune exec ./readlines.exe test.txt

结果应该如下,

(example,1,6)
(of,1,9)
(the,1,13)
(first,1,19)
(line,1,24)
(followed,2,7)
(by,2,10)
(the,2,14)
(second,2,21)
(line,2,26)

(注意,我是从 0 开始计算位置,而不是从 1 开始)。

您也可以在 utop 中交互运行此代码,但您需要安装

base
stdio
并将它们加载到解释器中,使用

#require "base";;
#require "stdio";;

如果您不使用

utop
而是使用默认的 OCaml 顶层,则还需要安装 ocamlfind (
opam install ocamlfind
) 并执行

#use "topfind";;
#require "base";;
#require "stdio";;

0
投票

如果您只想使用标准库来处理字符串,您可以使用

String.split_on_char
以及应用于每行的其他一些内容来执行您想要的操作。
下面是一个处理第一行的示例:

(* Reads the first line of a file and evaluates to the list of its words as
   [(line_number, word, last_pos)] triples, where [line_number] is always 1
   and [last_pos] is the 1-based column of the word's last character. *)
let ic = open_in (*your file name*) in
let first_line =
  (* Close the channel whether or not [input_line] succeeds. *)
  match input_line ic with
  | line -> close_in ic; line
  | exception e -> close_in_noerr ic; raise e
in
let words = String.split_on_char ' ' first_line in
(* [start] is the 0-based offset of the current piece within the line;
   [acc] collects the results in reverse so the recursion stays in tail
   position. *)
let rec aux start acc =
  function
  | [] -> List.rev acc
  (* an empty piece means there was an extra space before the split *)
  | "" :: rest -> aux (start + 1) acc rest
  | word :: rest ->
    let last = start + String.length word in
    (* [last + 1]: step over the word and the single separator after it *)
    aux (last + 1) ((1, word, last) :: acc) rest
in aux 0 [] words;;

正如ivg所说,您可以用

List.fold_left
替换 aux 函数:

(* Same computation as a left fold: reads the first line of a file and
   evaluates to its words as [(line_number, word, last_pos)] triples, where
   [line_number] is always 1 and [last_pos] is the 1-based column of the
   word's last character. *)
let ic = open_in (*your file name*) in
let first_line =
  (* Close the channel whether or not [input_line] succeeds. *)
  match input_line ic with
  | line -> close_in ic; line
  | exception e -> close_in_noerr ic; raise e
in
let words = String.split_on_char ' ' first_line in
let _, l = List.fold_left (
   (* [start] is the 0-based offset of the current piece within the line. *)
   fun (start, accRes) ->
       function
       (* an empty piece is an extra space: advance by one column only *)
       | "" -> (start + 1, accRes)
       | s ->
         let last = start + String.length s in
         (* [last + 1]: step over the word and the separator after it *)
         (last + 1, (1, s, last) :: accRes)
   ) (0, []) words
in List.rev l;;

0
投票

不包括文件 I/O 组件,但可以正确处理单词之间的多个空格,包括制表符。使用

fold_left
实现,算是给 OCaml 新手的一个趣味示例。

(** [words_with_last_index line] splits [line] on runs of spaces and tabs
    and returns each word paired with the 0-based index of its last
    character, in left-to-right order. Leading, trailing and repeated
    whitespace produce no entries.

    The fold tracks only where the current word started and slices it out
    with [String.sub] once it ends; results are consed and reversed once at
    the end, so the work is linear in the length of [line]. *)
let words_with_last_index line =
  (* Close the word (if any) that began at [start] and ends just before
     index [stop], pushing it onto the reversed accumulator. *)
  let finish (start, acc) stop =
    match start with
    | None -> acc
    | Some first -> (String.sub line first (stop - first), stop - 1) :: acc
  in
  let step ((start, acc) as state) (pos, ch) =
    match ch, start with
    | (' ' | '\t'), _ -> (None, finish state pos)
    | _, None -> (Some pos, acc)
    | _, Some _ -> state
  in
  line
  |> String.to_seqi
  |> Seq.fold_left step (None, [])
  (* A word that runs to the end of the line is still open here. *)
  |> (fun state -> finish state (String.length line))
  |> List.rev
  
(** [parse_lines text] splits [text] on ['\n'] and returns, for every word
    found by [words_with_last_index], a [(word, line, pos)] triple where
    [line] is the 1-based line number and [pos] is the position reported by
    [words_with_last_index]. *)
let parse_lines text =
  let tag_line index line =
    let line_no = index + 1 in
    List.map
      (fun (word, pos) -> (word, line_no, pos))
      (words_with_last_index line)
  in
  List.concat (List.mapi tag_line (String.split_on_char '\n' text))
© www.soinside.com 2019 - 2024. All rights reserved.