2026
- 目標 2026-05-02
PythonでCSVの読み込み速度調査を行う。
- pandas
- polars
23G程度だとpolars一択
Python
import polars as pl
from pathlib import Path
def save_parquet_by_date(
    input_path: str,
    output_dir: str,
    datetime_col: str = "pickup_datetime",
) -> None:
    """Read CSV or Parquet input and write it back as Parquet split by date.

    Args:
        input_path: Input path (CSV or Parquet); may contain a ``*`` wildcard.
        output_dir: Output directory; created (with parents) if missing.
        datetime_col: Name of the timestamp column from which the ``date``
            partition column is derived.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Decide the reader from the extension first, so that a glob such as
    # "data/*.parquet" is not routed to the CSV reader merely because it
    # contains "*". Every other input keeps its previous routing.
    if input_path.endswith(".parquet"):
        lf = pl.scan_parquet(input_path)
    elif input_path.endswith(".csv") or "*" in input_path:
        lf = pl.scan_csv(input_path)
    else:
        lf = pl.scan_parquet(input_path)

    # Parse the column only while it is still text. Parquet input may already
    # carry a temporal dtype, on which the `.str` namespace would raise.
    timestamp = pl.col(datetime_col)
    dtype = lf.collect_schema().get(datetime_col)
    if dtype is not None and dtype == pl.String:
        timestamp = timestamp.str.to_datetime()

    # Build the parse expression once and reuse it for the derived date column.
    lf = lf.with_columns(
        timestamp,
        timestamp.dt.date().alias("date"),
    )

    # Stream the result to disk, split by the derived date column.
    # NOTE(review): confirm that the installed polars version accepts
    # `partition_by` on `sink_parquet`; some releases expect a partitioning
    # scheme object as the target instead of a keyword argument.
    lf.sink_parquet(
        output_dir,
        partition_by="date",
        compression="zstd",
    )
    print(f"保存完了: {output_dir}")
Python
import polars as pl
def main():
    """Load one hour of rows from the Parquet file and print a summary.

    Scans ``hoge_polars.parquet`` lazily, parses ``pickup_datetime`` from
    text, keeps rows in the half-open window [09:00, 10:00) on 2026-10-13,
    then prints the shape and the first rows of the collected frame.
    """
    pickup = pl.col("pickup_datetime")
    window_start = pl.datetime(2026, 10, 13, 9, 0, 0)
    window_end = pl.datetime(2026, 10, 13, 10, 0, 0)

    lazy = pl.scan_parquet("hoge_polars.parquet")
    parsed = lazy.with_columns(pickup.str.to_datetime())
    # Lower bound inclusive, upper bound exclusive.
    in_window = (pickup >= window_start) & (pickup < window_end)
    result = parsed.filter(in_window).collect()

    print(result.shape)
    print(result.head())


if __name__ == "__main__":
    main()
Python
import polars as pl
# Target day and the (hour, minute) bounds of the time window.
date = (2010, 10, 13)
time1 = (9, 30)
time2 = (15, 20)

# Half-open window [start, end) built from the day plus each time of day.
start = pl.datetime(*(date + time1))
end = pl.datetime(*(date + time2))

# NOTE(review): `df` is not defined in this snippet; presumably a LazyFrame
# created earlier in the session (it is `.collect()`-ed below) — confirm.
pickup = pl.col("pickup_datetime")
parsed = df.with_columns(pickup.str.to_datetime())
result = parsed.filter((pickup >= start) & (pickup < end)).collect()

print(result.shape)
print(result.head())