PyPolars,两列上的条件连接

问题描述 投票:0回答:2

如何根据左侧 pl.LazyFrame 中各列的内容,使用每个 pl.LazyFrame 中的两列来连接两个 pl.LazyFrame?

import polars as pl

# Left-hand frame of the question: col_2 and col_3 are alternative join keys,
# and each row has at most one of them set.
lf1 = pl.LazyFrame(
    {
        "col_1": ["a", "b", "c"],
        "col_2": ["d", None, None],
        "col_3": [None, "e", None],
    },
    # Every column is a string column.
    schema={name: pl.Utf8 for name in ("col_1", "col_2", "col_3")},
)

# Right-hand frame of the question: col_a and col_b are the candidate keys,
# col_c is the payload to bring over to lf1.
lf2 = pl.LazyFrame(
    {
        "col_a": ["d", "xyz"],
        "col_b": ["xyz", "e"],
        "col_c": ["relevant_info_1", "relevant_info_2"],
    },
    # Every column is a string column.
    schema={name: pl.Utf8 for name in ("col_a", "col_b", "col_c")},
)

所需加入的伪代码:

lf1.join(lf2,
  when(col("col_2").isnotnull()).then(left_on="col_2", right_on="col_a")
  when(col("col_3").isnotnull()).then(left_on="col_3", right_on="col_b")
  otherwise(do_nothing)
)

预期结果:

col_1 | col_2 | col_3 | col_c
a     | d     | None  | relevant_info_1
b     | None  | e     | relevant_info_2
c     | None  | None  | None
python dataframe left-join python-polars
2个回答
2
投票

这里有一个方法。首先我们分别进行两个连接:

# Left join keyed on col_2 == col_a, materialized to a DataFrame.
join1 = (
    lf1
    .join(lf2, how="left", left_on="col_2", right_on="col_a")
    .collect()
)
# Left join keyed on col_3 == col_b, materialized to a DataFrame.
join2 = (
    lf1
    .join(lf2, how="left", left_on="col_3", right_on="col_b")
    .collect()
)

然后可以将 join1 和 join2 连接起来,并通过合并(coalesce)join1 和 join2 中的 col_c 来得到所需的 col_c。最后我只保留了您要求的列,但去掉最后的 select 语句来查看中间结果可能会有所帮助。

# Bring join2's col_c next to join1's (it arrives as "col_c_right"), keep the
# first non-null of the two as col_c, then trim to the requested columns.
(
    join1
    .join(join2.select(["col_1", "col_c"]), on="col_1", suffix="_right")
    .with_columns(
        pl.coalesce([pl.col("col_c"), pl.col("col_c_right")]).alias("col_c")
    )
    .select(["col_1", "col_2", "col_3", "col_c"])
)

0
投票
import polars as pl

# Left-hand benchmark input: col_2 and col_3 are alternative join keys.
LF1 = pl.LazyFrame(
    {
        "col_1": ["a", "b", "c"],
        "col_2": ["d", None, None],
        "col_3": [None, "e", None],
    },
    # Every column is a string column.
    schema={name: pl.Utf8 for name in ("col_1", "col_2", "col_3")},
)

# Right-hand benchmark input: col_a / col_b are the keys, col_c the payload.
LF2 = pl.LazyFrame(
    {
        "col_a": ["d", "xyz"],
        "col_b": ["xyz", "e"],
        "col_c": ["relevant_info_1", "relevant_info_2"],
    },
    # Every column is a string column.
    schema={name: pl.Utf8 for name in ("col_a", "col_b", "col_c")},
)


def foo1() -> pl.LazyFrame:
    """Build the conditional join as a vertical concatenation of three parts.

    The parts are, in order:

    1. rows of ``LF1`` matched to ``LF2`` on ``col_2 == col_a``;
    2. rows of ``LF1`` matched to ``LF2`` on ``col_3 == col_b``;
    3. rows of ``LF1`` where both ``col_2`` and ``col_3`` are null, with a
       null ``col_c`` attached.

    Both joins rely on the default ``how`` of ``LazyFrame.join`` (an inner
    join in Polars), so an ``LF1`` row whose key is set but has no
    counterpart in ``LF2`` is absent from the result.
    NOTE(review): this also presumes null keys never match each other, which
    is the Polars default -- confirm for the installed version.

    Returns:
        An un-collected ``pl.LazyFrame`` with the columns ``col_1``,
        ``col_2``, ``col_3`` and ``col_c``.
    """
    output_columns = ["col_1", "col_2", "col_3", "col_c"]

    # No placeholder key columns are added before the select: it keeps only
    # ``output_columns``, so any such column would be dropped immediately.
    lf_joined_on_col_2 = LF1.join(
        other=LF2, left_on=["col_2"], right_on=["col_a"]
    ).select(output_columns)
    lf_joined_on_col_3 = LF1.join(
        other=LF2, left_on=["col_3"], right_on=["col_b"]
    ).select(output_columns)

    # Rows with neither key set cannot match anything; give them a typed
    # null col_c so their schema lines up with the two joined parts.
    lf_rows_with_null_on_col_2_and_col_3 = LF1.filter(
        pl.col("col_2").is_null() & pl.col("col_3").is_null()
    ).with_columns(pl.lit(None).cast(pl.Utf8).alias("col_c"))

    return pl.concat(
        [lf_joined_on_col_2, lf_joined_on_col_3, lf_rows_with_null_on_col_2_and_col_3]
    )


def foo2() -> pl.LazyFrame:
    """Build the conditional join from two left joins and a coalesce.

    ``LF1`` is left-joined to ``LF2`` once on ``col_2 == col_a`` and once on
    ``col_3 == col_b``. The two results are then paired up on ``col_1`` and
    the first non-null ``col_c`` of the pair is kept.

    NOTE(review): pairing on ``col_1`` assumes its values identify rows of
    ``LF1`` uniquely; duplicated values would multiply rows -- confirm for
    real data.

    Returns:
        An un-collected ``pl.LazyFrame`` with the columns ``col_1``,
        ``col_2``, ``col_3`` and ``col_c``.
    """
    lf_joined_on_col_2 = LF1.join(
        other=LF2, left_on=["col_2"], right_on=["col_a"], how="left"
    )
    lf_joined_on_col_3 = LF1.join(
        other=LF2, left_on=["col_3"], right_on=["col_b"], how="left"
    )

    return (
        lf_joined_on_col_2.join(
            # Only col_c is needed from the second join. It collides with the
            # left frame's col_c, so it arrives under the "_right" suffix,
            # spelled out here because the coalesce below depends on it.
            lf_joined_on_col_3.select(["col_1", "col_c"]),
            on=["col_1"],
            how="inner",
            suffix="_right",
        )
        .with_columns(pl.coalesce(["col_c", "col_c_right"]).alias("col_c"))
        .select(["col_1", "col_2", "col_3", "col_c"])
    )
@robertdj 提供的解决方案 foo2() 的速度是 foo1() 的两倍(注意:两个函数都只返回尚未 collect 的 LazyFrame,因此下面的计时衡量的是查询计划的构建,而不是查询的实际执行):

> python -m timeit -n 10000 -s "import test1" "test1.foo1()"
10000 loops, best of 5: 116 usec per loop
> python -m timeit -n 10000 -s "import test1" "test1.foo2()"
10000 loops, best of 5: 65 usec per loop
    
© www.soinside.com 2019 - 2024. All rights reserved.