我有一个与“如何从 Polars DataFrame 的列表中选择所有列”类似的问题,但略有不同:
import polars as pl
import numpy as np
import string
# Number of rows in the example frame.
n = 1000
letters = list(string.ascii_letters)
uppercase = list(string.ascii_uppercase)

# One random word (3 to 19 letters) and one random uppercase group label per
# row. Word and group are drawn alternately, row by row.
words, groups = [], []
for _ in range(n):
    word_length = np.random.randint(3, 20)  # upper bound is exclusive
    words.append("".join(np.random.choice(letters) for _ in range(word_length)))
    groups.append(np.random.choice(uppercase))

# Example frame: seven "a_<i>" columns, seven "b_<i>" columns and two columns
# ("words", "groups") that carry no numeric suffix.
df = pl.DataFrame(
    {
        # a_i is an evenly spaced ramp from i to i + 1.
        **{f"a_{i}": np.linspace(i, i + 1, n) for i in range(7)},
        # b_i is np.random.rand noise scaled by (i + 1).
        **{f"b_{i}": (i + 1) * np.random.rand(n) for i in range(7)},
        "words": words,
        "groups": groups,
    }
)
我想再次将列 `a_0`、`a_1`……连接成列 `a`,将列 `b_0`、`b_1`……连接成列 `b`。然而,与之前的问题不同的是,这次 `a = [a_0; a_1; ...]`,即首先是 `a_0` 的所有元素,然后是 `a_1` 的所有元素,依此类推。名称不是以 `_` 加一个数字结尾的所有列(在本例中为 `words` 和 `groups`)必须重复足够多的次数,以匹配 `a` 的长度。我怎样才能做到这一点?
可以尝试下面的代码,它会按上述方式连接这些列:
import polars as pl
import numpy as np
import string
# Number of rows in the example frame.
n = 1000
letters = list(string.ascii_letters)
uppercase = list(string.ascii_uppercase)

# One random word (3 to 19 letters) and one random uppercase group label per
# row. Word and group are drawn alternately, row by row.
words, groups = [], []
for _ in range(n):
    word_length = np.random.randint(3, 20)  # upper bound is exclusive
    words.append("".join(np.random.choice(letters) for _ in range(word_length)))
    groups.append(np.random.choice(uppercase))

# Example frame: seven "a_<i>" columns, seven "b_<i>" columns and two columns
# ("words", "groups") that carry no numeric suffix.
df = pl.DataFrame(
    {
        # a_i is an evenly spaced ramp from i to i + 1.
        **{f"a_{i}": np.linspace(i, i + 1, n) for i in range(7)},
        # b_i is np.random.rand noise scaled by (i + 1).
        **{f"b_{i}": (i + 1) * np.random.rand(n) for i in range(7)},
        "words": words,
        "groups": groups,
    }
)
# Split the column names into "<prefix>_<digits>" columns (grouped by prefix)
# and all remaining columns (in this example: "words" and "groups").
indexed_columns = {}  # prefix -> list of (numeric index, column name)
other_columns = []
for col in df.columns:
    prefix, sep, suffix = col.rpartition("_")
    # isdecimal() guarantees that int(suffix) succeeds.
    if sep and prefix and suffix.isdecimal():
        indexed_columns.setdefault(prefix, []).append((int(suffix), col))
    else:
        other_columns.append(col)

# Sort each group numerically so that e.g. "a_10" comes after "a_9".
for members in indexed_columns.values():
    members.sort()

# The groups are stacked in lockstep, so every prefix has to provide exactly
# the same indices; otherwise the stacked columns would differ in length.
index_sets = {
    tuple(index for index, _ in members) for members in indexed_columns.values()
}
if len(index_sets) > 1:
    raise ValueError(
        "all '<prefix>_<n>' column groups must share the same set of indices"
    )

if indexed_columns:
    n_blocks = len(next(iter(index_sets)))
    # Block k holds a_k, b_k, ... renamed to a, b, ... next to an unchanged copy
    # of the other columns. Stacking the blocks vertically therefore yields
    # a = [a_0; a_1; ...] and repeats the other columns once per block.
    blocks = [
        df.select(
            [
                pl.col(members[k][1]).alias(prefix)
                for prefix, members in indexed_columns.items()
            ]
            + [pl.col(name) for name in other_columns]
        )
        for k in range(n_blocks)
    ]
    result = pl.concat(blocks)
else:
    # Nothing to stack: the frame is already in the requested shape.
    result = df
print(result)