Tidyverse 🪐에서 Polars 🐻‍❄️까지: 내 노트

R-Blogger · 블로그·해설 · 2024-12-22

R-Blogger · 블로그·해설 · 한국어 · 2024-12-22

Tidyverse 🪐에서 Polars 🐻‍❄️까지: 내 노트

Polars를 사용한 R 기반 파이썬 데이터 분석 연습 동기부여 Polars 구문은 dplyr과 매우 유사합니다. 함수 체이닝 방식을 사용하면 익숙함을 더 느낄 수 있었습니다. 이제 배운 세부 사항을 실전에서 적용해 볼 차례입니다. 행운을 빕니다! 🍀 목표 데이터프레임 생성 필터, 선택, 요약, 전체 변형, 문자열 결합, 조건 분기, 조인, 더미/피벗/언피벗 유용한 자료 및 학습 내용 정리 데이터프레임 생성 (tidyverse 예시) 다음은 tidyverse를 사용한 R 예시입니다. 아래 코드를 보며 Polars 변환을 확인합니다. library(tidyverse) library(reticulate) use_virtualenv('path/to/your/environment') df %>% filter(age >= 30) %>% select(1:3, starts_with("performance"), starts_with("salary")) %>% summarize(across(.cols = where(is.numeric), .fns = mean, .names = "mean_{.col}")) Polars 변환 예시 Polars에서는 다음과 같이 작성합니다. df .filter(pl.col('age') >= 30) .select(df.columns[0:4]+['^performance.*$','^salary.*$']) .select(pl.col(pl.Int64).mean().name.prefix('mean_')) 정규식에서 ^와 $를 사용해 컬럼명을 정확히 지정해야 한다는 점이 다소 이상합니다. 열 변환: 문자열 결합 (mutate, paste) 모든 문자열 컬럼을 공백으로 결합한 새 컬럼을 만들고자 합니다. tidyverse: df %>% rowwise() %>% transmute(combination_of_character = paste(across(where(is.character)), collapse = " ")) %>% select(combination_of_character) Polars: df .with_columns( pl.concat_str( pl.col(pl.String), separator=" " ).alias('combination_of_character') ) .select(pl.col('combination_of_character')) 열 변환: 두 개 컬럼 결합 (glue) 연령과 급여를 하이픈(-)으로 연결한 새 컬럼을 만들고자 합니다. 
tidyverse: df %>% mutate(age_salary = paste0(age, "-", salary)) %>% select(name, age_salary) Polars: df .with_columns( age_salary=pl.format('{}-{}',pl.col('age'),pl.col('salary')) ) .select(pl.col('name','age_salary')) 한 개 컬럼만 만들 때는 alias 없이 format만 사용해도 됩니다. 열 변환: 정규식 추출 (area_code_and_salary) 주소에서 도로 번호를 추출해 급여와 결합한 새 컬럼을 만들고자 합니다. tidyverse: df %>% mutate(area_code_and_salary = paste0(str_extract(address, "\\d{0,5}"), " ", salary)) %>% select(area_code_and_salary) Polars: df .select( pl.concat_str( pl.col('address').str.extract(r'^(\d{0,5})'), pl.lit(" "), pl.col('salary') ).alias('area_code_and_salary') ) 조건 분기 (case_when) 주소에 따라 'familiarity' 컬럼을 생성합니다. tidyverse: df %>% mutate(familiarity = case_when( str_detect(address, "OH") ~ "local", str_detect(address, "NY") ~ "foodie", TRUE ~ "elsewhere" )) Polars: df .with_columns( pl.when(pl.col('address').str.contains('OH')).then(pl.lit('local')) .when(pl.col('address').str.contains('NY')).then(pl.lit('foodie')) .otherwise(pl.lit('elsewhere')) .alias('familiarity') ) 정규식으로 이메일 전부 추출 (look‑ahead / look‑back 불가) Polars에서는 정규식에 look‑ahead(.*(?=@))나 look‑back을 사용할 수 없습니다. 그룹화 및 시프트 (Group_by, Shift, Forward_Fill) 부서별로 급여를 한 줄씩 시프트하고, 결측값을 전부 채우는 예시입니다. 
tidyverse: df %>% group_by(department) %>% summarize( name = name, salary_shift = case_when( n() == 1 ~ salary, TRUE ~ lead(salary) ) ) %>% fill(salary_shift, .direction = "down") Polars (단일 라인 시프트와 전부 채우기): df .group_by('department') .agg( pl.col('name'), pl.when(pl.col('salary').len()==1).then(pl.col('salary')) .otherwise(pl.col('salary').shift(-1)) .alias('salary_shift') ) .explode('name','salary_shift') .with_columns( pl.col('salary_shift').forward_fill() ) Polars는 그룹화 시 리스트 컬럼을 생성하므로 explode() 로 펼친 뒤 forward_fill() 를 사용합니다. 더미 변환 및 피벗 (to_dummies / pivot) 부서별 더미 변수를 만들고, 피벗으로 상태별 분포를 확인합니다. tidyverse 예시: df %>% select(name, department) %>% pivot_wider( id_cols = "name", names_from = "department", values_from = "department", values_fill = 0, values_fn = length, names_prefix = "department_" ) Polars to_dummies: df .select(['name','department']) .to_dummies(columns = 'department') Polars 피벗 예시 (상태별 분포): df .select(['name','address']) .with_columns( state = pl.col('address').str.extract(r'([A-Z]{2})$
원문 URL
전체 글은 원문 페이지에서 이어서 읽을 수 있습니다.
원문에서 전체 글 읽기
작성자
R-Blogger
출처
R-Blogger
플랫폼
R-Blogger
분류
블로그·해설
언어
한국어
발행일
2024-12-22