Skip to content

Commit 64a2d2a

Browse files
committed
null处理提供选项
1 parent d231fad commit 64a2d2a

File tree

5 files changed

+39
-14
lines changed

5 files changed

+39
-14
lines changed

README.md

+8-3
Original file line numberDiff line numberDiff line change
@@ -140,15 +140,20 @@ df = codegen_exec(df.lazy(), _code_block_1, _code_block_2).collect(engine="gpu")
140140
https://github.com/pola-rs/polars/issues/12925#issuecomment-2552764629
141141
非常棒的点子,总结下来有两种实现方式:
142142

143-
1.`null`分成一组,`not_null`分成另一组要计算两次
144-
2. 仅一组,但复合排序,将`null`排在前面,`not_null`排后面,只要计算一次
143+
1. `null`分成一组,`not_null`分成另一组,要计算两次
144+
2. 仅一组,但复合排序,将`null`排在前面,`not_null`排后面。只计算一次,略快一些
145145

146146
```python
147147
X1 = (ts_returns(CLOSE, 3)).over(CLOSE.is_not_null(), _ASSET_, order_by=_DATE_),
148148
X2 = (ts_returns(CLOSE, 3)).over(_ASSET_, order_by=[CLOSE.is_not_null(), _DATE_]),
149+
X3 = (ts_returns(CLOSE, 3)).over(_ASSET_, order_by=_DATE_),
149150
```
150151

151-
目前使用的是第2种
152+
第2种开头的`null`区域是否影响结果,由算子决定;特别是多列输入时,`null`区域可能有数据
153+
154+
1. `over_null='partition_by'`。分到两个区域
155+
2. `over_null='order_by'`。分到一个区域,`null`排在前面
156+
3. `over_null=None`。不处理,直接计算,速度更快
152157

153158
## 二次开发
154159

expr_codegen/pandas/code.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
3838
filename='template.py.j2',
3939
date='date', asset='asset',
4040
alias: Dict[str, str] = {},
41-
extra_codes: Sequence[str] = ()):
41+
extra_codes: Sequence[str] = (),
42+
**kwargs):
4243
"""基于模板的代码生成"""
4344
# 打印Pandas风格代码
4445
p = PandasStrPrinter()

expr_codegen/polars_group/code.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
3939
filename='template.py.j2',
4040
date='date', asset='asset',
4141
alias: Dict[str, str] = {},
42-
extra_codes: Sequence[str] = ()):
42+
extra_codes: Sequence[str] = (),
43+
**kwargs):
4344
"""基于模板的代码生成"""
4445
# 打印Polars风格代码
4546
p = PolarsStrPrinter()

expr_codegen/polars_over/code.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from typing import Sequence, Dict
2+
from typing import Sequence, Dict, Literal
33

44
import jinja2
55
from jinja2 import FileSystemLoader, TemplateNotFound
@@ -39,7 +39,9 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
3939
filename='template.py.j2',
4040
date='date', asset='asset',
4141
alias: Dict[str, str] = {},
42-
extra_codes: Sequence[str] = ()):
42+
extra_codes: Sequence[str] = (),
43+
over_null: Literal['order_by', 'partition_by', None] = 'partition_by',
44+
**kwargs):
4345
"""基于模板的代码生成"""
4446
# 打印Polars风格代码
4547
p = PolarsStrPrinter()
@@ -82,8 +84,12 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
8284
_sym = f"pl.all_horizontal({','.join(_sym)})"
8385
else:
8486
_sym = ','.join(_sym)
85-
# func_code.append(f"{va}=({s2}).over({_sym}, _ASSET_, order_by=_DATE_),")
86-
func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=[{_sym}, _DATE_]),")
87+
if over_null == 'partition_by':
88+
func_code.append(f"{va}=({s2}).over({_sym}, _ASSET_, order_by=_DATE_),")
89+
elif over_null == 'order_by':
90+
func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=[{_sym}, _DATE_]),")
91+
else:
92+
func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=_DATE_),")
8793
elif k[0] == CS:
8894
func_code.append(f"{va}=({s2}).over(_DATE_),")
8995
elif k[0] == GP:

expr_codegen/tool.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,8 @@ def all(self, exprs_src, style: Literal['pandas', 'polars_group', 'polars_over']
193193
replace: bool = True, regroup: bool = False, format: bool = True,
194194
date='date', asset='asset',
195195
alias: Dict[str, str] = {},
196-
extra_codes: Sequence[object] = ()):
196+
extra_codes: Sequence[object] = (),
197+
**kwargs):
197198
"""功能集成版,将几个功能写到一起方便使用
198199
199200
Parameters
@@ -252,7 +253,8 @@ def all(self, exprs_src, style: Literal['pandas', 'polars_group', 'polars_over']
252253
codes = codegen(exprs_ldl, exprs_src, syms_dst,
253254
filename=template_file, date=date, asset=asset,
254255
alias=alias,
255-
extra_codes=extra_codes)
256+
extra_codes=extra_codes,
257+
**kwargs)
256258

257259
if format:
258260
# 格式化。在遗传算法中没有必要
@@ -267,7 +269,8 @@ def _get_code(self,
267269
output_file: str,
268270
convert_xor: bool,
269271
style: Literal['pandas', 'polars_group', 'polars_over'] = 'polars_over', template_file: str = 'template.py.j2',
270-
date: str = 'date', asset: str = 'asset') -> str:
272+
date: str = 'date', asset: str = 'asset',
273+
**kwargs) -> str:
271274
"""通过字符串生成代码, 加了缓存,多次调用不重复生成"""
272275
raw, exprs_dict = sources_to_exprs(self.globals_, source, *more_sources, convert_xor=convert_xor)
273276

@@ -279,7 +282,8 @@ def _get_code(self,
279282
extra_codes=(raw,
280283
# 传入多个列的方法
281284
extra_codes,
282-
))
285+
),
286+
**kwargs)
283287

284288
# 移回到cache,防止多次调用多次保存
285289
if isinstance(output_file, TextIOWrapper):
@@ -324,7 +328,8 @@ def codegen_exec(df: Optional[DataFrame],
324328
style: Literal['pandas', 'polars_group', 'polars_over'] = 'polars_over',
325329
template_file: str = 'template.py.j2',
326330
date: str = 'date', asset: str = 'asset',
327-
) -> Optional[DataFrame]:
331+
over_null: Literal['order_by', 'partition_by', None] = 'partition_by',
332+
**kwargs) -> Optional[DataFrame]:
328333
"""快速转换源代码并执行
329334
330335
Parameters
@@ -355,6 +360,11 @@ def codegen_exec(df: Optional[DataFrame],
355360
时间字段
356361
asset: str
357362
资产字段
363+
over_null: str
364+
时序中遇到null时的处理方式
365+
- order_by: 空值排同一分区的前排
366+
- partition_by: 空值划分到不同分区
367+
- None: 不做处理
358368
359369
Returns
360370
-------
@@ -391,6 +401,8 @@ def codegen_exec(df: Optional[DataFrame],
391401
convert_xor=convert_xor,
392402
style=style, template_file=template_file,
393403
date=date, asset=asset,
404+
over_null=over_null,
405+
**kwargs
394406
)
395407

396408
if df is None:

0 commit comments

Comments
 (0)