"""Define the PySRRegressor scikit-learn interface."""
import copy
import os
import pickle as pkl
import re
import shutil
import sys
import tempfile
import warnings
from datetime import datetime
from io import StringIO
from multiprocessing import cpu_count
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple, Union

if sys.version_info >= (3, 8):
    from typing import Literal
else:
    from typing_extensions import Literal

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
from sklearn.utils import check_array, check_consistent_length, check_random_state
from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
from .denoising import denoise, multi_denoise
from .deprecated import make_deprecated_kwargs_for_pysr_regressor
from .export_jax import sympy2jax
from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
from .export_numpy import sympy2numpy
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
from .export_torch import sympy2torch
from .feature_selection import run_feature_selection
from .julia_helpers import (
_escape_filename,
_load_backend,
_load_cluster_manager,
_process_julia_project,
_update_julia_project,
init_julia,
is_julia_version_greater_eq,
)
from .utils import (
_csv_filename_to_pkl_filename,
_preprocess_julia_floats,
_safe_check_feature_names_in,
_subscriptify,
)

Main = None  # TODO: Rename to more descriptive name like "julia_runtime"

already_ran = False


def _process_constraints(binary_operators, unary_operators, constraints):
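    """Fill in default argument-complexity constraints for unspecified operators.

    Unary operators default to an unconstrained argument (-1) and binary
    operators to (-1, -1); for `*`, the more complex side is swapped into
    the left argument. A quick doctest-style illustration:

    >>> _process_constraints(["*"], ["sin"], {"*": [1, 3]})
    {'*': [3, 1], 'sin': -1}
    """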
constraints = constraints.copy()
for op in unary_operators:
if op not in constraints:
constraints[op] = -1
for op in binary_operators:
if op not in constraints:
if op in ["^", "pow"]:
# Warn user that they should set up constraints
warnings.warn(
"You are using the `^` operator, but have not set up `constraints` for it. "
"This may lead to overly complex expressions. "
"One typical constraint is to use `constraints={..., '^': (-1, 1)}`, which "
"will allow arbitrary-complexity base (-1) but only powers such as "
"a constant or variable (1). "
"For more tips, please see https://astroautomata.com/PySR/tuning/"
)
constraints[op] = (-1, -1)
if op in ["plus", "sub", "+", "-"]:
if constraints[op][0] != constraints[op][1]:
raise NotImplementedError(
"You need equal constraints on both sides for - and +, "
"due to simplification strategies."
)
elif op in ["mult", "*"]:
# Make sure the complex expression is in the left side.
if constraints[op][0] == -1:
continue
if constraints[op][1] == -1 or constraints[op][0] < constraints[op][1]:
constraints[op][0], constraints[op][1] = (
constraints[op][1],
constraints[op][0],
)
return constraints


def _maybe_create_inline_operators(
binary_operators, unary_operators, extra_sympy_mappings
):
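    """Evaluate custom Julia-defined operators and strip them to bare names.

    Any operator string containing `(` (e.g., `"inv(x)=1/x"`) is evaluated
    in the Julia runtime, and the operator list entry is replaced by the
    bare function name, which must also be present in `extra_sympy_mappings`.
    """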
global Main
binary_operators = binary_operators.copy()
unary_operators = unary_operators.copy()
for op_list in [binary_operators, unary_operators]:
for i, op in enumerate(op_list):
is_user_defined_operator = "(" in op
if is_user_defined_operator:
Main.eval(op)
                # Cut off at the first "(" to extract the function name:
                first_paren = op.index("(")
                function_name = op[:first_paren]
# Assert that function_name only contains
# alphabetical characters, numbers,
# and underscores:
if not re.match(r"^[a-zA-Z0-9_]+$", function_name):
                    raise ValueError(
                        f"Invalid function name {function_name}. "
                        "Only alphabetical characters, numbers, "
                        "and underscores are allowed."
                    )
                if (extra_sympy_mappings is None) or (
                    function_name not in extra_sympy_mappings
                ):
raise ValueError(
f"Custom function {function_name} is not defined in `extra_sympy_mappings`. "
"You can define it with, "
"e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1/x})`, where "
"`lambda x: 1/x` is a valid SymPy function defining the operator. "
"You can also define these at initialization time."
)
op_list[i] = function_name
return binary_operators, unary_operators


def _check_assertions(
X,
use_custom_variable_names,
variable_names,
weights,
y,
X_units,
y_units,
):
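    """Check shapes and names of user-provided data before fitting."""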
# Check for potential errors before they happen
assert len(X.shape) == 2
assert len(y.shape) in [1, 2]
assert X.shape[0] == y.shape[0]
if weights is not None:
assert weights.shape == y.shape
assert X.shape[0] == weights.shape[0]
if use_custom_variable_names:
assert len(variable_names) == X.shape[1]
# Check none of the variable names are function names:
for var_name in variable_names:
# Check if alphanumeric only:
if not re.match(r"^[₀₁₂₃₄₅₆₇₈₉a-zA-Z0-9_]+$", var_name):
                raise ValueError(
                    f"Invalid variable name {var_name}. "
                    "Only alphabetical characters, numbers, "
                    "subscripts, and underscores are allowed."
                )
assert_valid_sympy_symbol(var_name)
if X_units is not None and len(X_units) != X.shape[1]:
raise ValueError(
"The number of units in `X_units` must equal the number of features in `X`."
)
if y_units is not None:
good_y_units = False
if isinstance(y_units, list):
if len(y.shape) == 1:
good_y_units = len(y_units) == 1
else:
good_y_units = len(y_units) == y.shape[1]
else:
good_y_units = len(y.shape) == 1 or y.shape[1] == 1
if not good_y_units:
raise ValueError(
"The number of units in `y_units` must equal the number of output features in `y`."
)


# Class validation constants
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]


class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
"""
High-performance symbolic regression algorithm.
This is the scikit-learn interface for SymbolicRegression.jl.
This model will automatically search for equations which fit
a given dataset subject to a particular loss and set of
constraints.
Most default parameters have been tuned over several example equations,
but you should adjust `niterations`, `binary_operators`, `unary_operators`
to your requirements. You can view more detailed explanations of the options
on the [options page](https://astroautomata.com/PySR/options) of the
documentation.
Parameters
----------
model_selection : str
        Model selection criterion when selecting a final expression from
        the list of best expressions at each complexity.
        Can be `'accuracy'`, `'best'`, or `'score'`. Default is `'best'`.
`'accuracy'` selects the candidate model with the lowest loss
(highest accuracy).
`'score'` selects the candidate model with the highest score.
Score is defined as the negated derivative of the log-loss with
respect to complexity - if an expression has a much better
loss at a slightly higher complexity, it is preferred.
`'best'` selects the candidate model with the highest score
among expressions with a loss better than at least 1.5x the
most accurate model.
binary_operators : list[str]
List of strings for binary operators used in the search.
See the [operators page](https://astroautomata.com/PySR/operators/)
for more details.
Default is `["+", "-", "*", "/"]`.
unary_operators : list[str]
Operators which only take a single scalar as input.
For example, `"cos"` or `"exp"`.
Default is `None`.
niterations : int
Number of iterations of the algorithm to run. The best
equations are printed and migrate between populations at the
end of each iteration.
Default is `40`.
populations : int
Number of populations running.
Default is `15`.
population_size : int
Number of individuals in each population.
Default is `33`.
max_evals : int
Limits the total number of evaluations of expressions to
this number. Default is `None`.
maxsize : int
Max complexity of an equation. Default is `20`.
maxdepth : int
Max depth of an equation. You can use both `maxsize` and
`maxdepth`. `maxdepth` is by default not used.
Default is `None`.
warmup_maxsize_by : float
        Whether to slowly increase max size from a small number up to
        the maxsize (if greater than 0). If greater than 0, this specifies
        the fraction of training time at which the current maxsize will
        reach the user-passed maxsize.
        Default is `0.0`.
timeout_in_seconds : float
Make the search return early once this many seconds have passed.
Default is `None`.
constraints : dict[str, int | tuple[int,int]]
Dictionary of int (unary) or 2-tuples (binary), this enforces
maxsize constraints on the individual arguments of operators.
E.g., `'pow': (-1, 1)` says that power laws can have any
complexity left argument, but only 1 complexity in the right
argument. Use this to force more interpretable solutions.
Default is `None`.
nested_constraints : dict[str, dict]
        Specifies how many times a combination of operators can be
        nested. For example, `{"sin": {"cos": 0}, "cos": {"cos": 2}}`
        specifies that `cos` may never appear within a `sin`, but `sin`
can be nested with itself an unlimited number of times. The
second term specifies that `cos` can be nested up to 2 times
within a `cos`, so that `cos(cos(cos(x)))` is allowed
(as well as any combination of `+` or `-` within it), but
`cos(cos(cos(cos(x))))` is not allowed. When an operator is not
specified, it is assumed that it can be nested an unlimited
number of times. This requires that there is no operator which
is used both in the unary operators and the binary operators
(e.g., `-` could be both subtract, and negation). For binary
operators, you only need to provide a single number: both
arguments are treated the same way, and the max of each
argument is constrained.
Default is `None`.
loss : str
String of Julia code specifying an elementwise loss function.
Can either be a loss from LossFunctions.jl, or your own loss
written as a function. Examples of custom written losses include:
`myloss(x, y) = abs(x-y)` for non-weighted, or
`myloss(x, y, w) = w*abs(x-y)` for weighted.
        Built-in losses include:
Regression: `LPDistLoss{P}()`, `L1DistLoss()`,
`L2DistLoss()` (mean square), `LogitDistLoss()`,
`HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`,
`PeriodicLoss(c)`, `QuantileLoss(τ)`.
Classification: `ZeroOneLoss()`, `PerceptronLoss()`,
`L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`,
`ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`,
`SigmoidLoss()`, `DWDMarginLoss(q)`.
Default is `"L2DistLoss()"`.
full_objective : str
Alternatively, you can specify the full objective function as
a snippet of Julia code, including any sort of custom evaluation
(including symbolic manipulations beforehand), and any sort
of loss function or regularizations. The default `full_objective`
used in SymbolicRegression.jl is roughly equal to:
```julia
function eval_loss(tree, dataset::Dataset{T,L}, options)::L where {T,L}
prediction, flag = eval_tree_array(tree, dataset.X, options)
if !flag
return L(Inf)
end
return sum((prediction .- dataset.y) .^ 2) / dataset.n
end
```
where the example elementwise loss is mean-squared error.
You may pass a function with the same arguments as this (note
that the name of the function doesn't matter). Here,
both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
If using `batching`, then you should add an
`idx` argument to the function, which is `nothing`
for non-batched, and a 1D array of indices for batched.
Default is `None`.
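        As an illustrative sketch only (the function name here is
        arbitrary), a batched objective could be passed as:
        ```python
        model = PySRRegressor(
            batching=True,
            full_objective='''
            function my_objective(tree, dataset::Dataset{T,L}, options, idx)::L where {T,L}
                # idx is `nothing` for full-dataset evaluations:
                X = idx === nothing ? dataset.X : dataset.X[:, idx]
                y = idx === nothing ? dataset.y : dataset.y[idx]
                prediction, flag = eval_tree_array(tree, X, options)
                !flag && return L(Inf)
                return sum((prediction .- y) .^ 2) / length(y)
            end
            ''',
        )
        ```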
complexity_of_operators : dict[str, float]
If you would like to use a complexity other than 1 for an
operator, specify the complexity here. For example,
`{"sin": 2, "+": 1}` would give a complexity of 2 for each use
of the `sin` operator, and a complexity of 1 for each use of
the `+` operator (which is the default). You may specify real
numbers for a complexity, and the total complexity of a tree
will be rounded to the nearest integer after computing.
Default is `None`.
complexity_of_constants : float
Complexity of constants. Default is `1`.
complexity_of_variables : float
Complexity of variables. Default is `1`.
parsimony : float
Multiplicative factor for how much to punish complexity.
Default is `0.0032`.
dimensional_constraint_penalty : float
        Additive penalty applied if dimensional analysis of an expression fails.
By default, this is `1000.0`.
use_frequency : bool
Whether to measure the frequency of complexities, and use that
instead of parsimony to explore equation space. Will naturally
find equations of all complexities.
Default is `True`.
use_frequency_in_tournament : bool
Whether to use the frequency mentioned above in the tournament,
rather than just the simulated annealing.
Default is `True`.
adaptive_parsimony_scaling : float
        If the adaptive parsimony strategy is used (`use_frequency` and
        `use_frequency_in_tournament`), this is how much to (exponentially)
        weight the contribution. If you find that the search is only optimizing
the most complex expressions while the simpler expressions remain stagnant,
you should increase this value.
Default is `20.0`.
alpha : float
Initial temperature for simulated annealing
(requires `annealing` to be `True`).
Default is `0.1`.
annealing : bool
Whether to use annealing. Default is `False`.
early_stop_condition : float | str
Stop the search early if this loss is reached. You may also
pass a string containing a Julia function which
takes a loss and complexity as input, for example:
`"f(loss, complexity) = (loss < 0.1) && (complexity < 10)"`.
Default is `None`.
ncyclesperiteration : int
Number of total mutations to run, per 10 samples of the
population, per iteration.
Default is `550`.
fraction_replaced : float
How much of population to replace with migrating equations from
other populations.
Default is `0.000364`.
fraction_replaced_hof : float
How much of population to replace with migrating equations from
hall of fame. Default is `0.035`.
weight_add_node : float
Relative likelihood for mutation to add a node.
Default is `0.79`.
weight_insert_node : float
Relative likelihood for mutation to insert a node.
Default is `5.1`.
weight_delete_node : float
Relative likelihood for mutation to delete a node.
Default is `1.7`.
weight_do_nothing : float
Relative likelihood for mutation to leave the individual.
Default is `0.21`.
weight_mutate_constant : float
Relative likelihood for mutation to change the constant slightly
in a random direction.
Default is `0.048`.
weight_mutate_operator : float
Relative likelihood for mutation to swap an operator.
Default is `0.47`.
weight_swap_operands : float
        Relative likelihood for swapping operands in binary operators.
Default is `0.0`.
weight_randomize : float
Relative likelihood for mutation to completely delete and then
        randomly generate the equation.
        Default is `0.00023`.
weight_simplify : float
        Relative likelihood for mutation to simplify constant parts by evaluation.
Default is `0.0020`.
    weight_optimize : float
        Constant optimization can also be performed as a mutation, in addition to
        the normal strategy controlled by `optimize_probability`, which happens
        every iteration. Using it as a mutation is useful if you want to use
        a large `ncyclesperiteration`, under which constants would otherwise
        rarely be optimized.
        Default is `0.0`.
crossover_probability : float
Absolute probability of crossover-type genetic operation, instead of a mutation.
Default is `0.066`.
skip_mutation_failures : bool
Whether to skip mutation and crossover failures, rather than
simply re-sampling the current member.
Default is `True`.
migration : bool
Whether to migrate. Default is `True`.
hof_migration : bool
Whether to have the hall of fame migrate. Default is `True`.
topn : int
How many top individuals migrate from each population.
Default is `12`.
should_simplify : bool
Whether to use algebraic simplification in the search. Note that only
a few simple rules are implemented. Default is `True`.
should_optimize_constants : bool
Whether to numerically optimize constants (Nelder-Mead/Newton)
at the end of each iteration. Default is `True`.
optimizer_algorithm : str
Optimization scheme to use for optimizing constants. Can currently
be `NelderMead` or `BFGS`.
Default is `"BFGS"`.
optimizer_nrestarts : int
        Number of times to restart the constants optimization process with
different initial conditions.
Default is `2`.
optimize_probability : float
Probability of optimizing the constants during a single iteration of
the evolutionary algorithm.
Default is `0.14`.
optimizer_iterations : int
Number of iterations that the constants optimizer can take.
Default is `8`.
perturbation_factor : float
Constants are perturbed by a max factor of
(perturbation_factor*T + 1). Either multiplied by this or
divided by this.
Default is `0.076`.
tournament_selection_n : int
Number of expressions to consider in each tournament.
Default is `10`.
tournament_selection_p : float
Probability of selecting the best expression in each
tournament. The probability will decay as p*(1-p)^n for other
expressions, sorted by loss.
Default is `0.86`.
procs : int
Number of processes (=number of populations running).
Default is `cpu_count()`.
multithreading : bool
Use multithreading instead of distributed backend.
Using procs=0 will turn off both. Default is `True`.
cluster_manager : str
For distributed computing, this sets the job queue system. Set
to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or
"htc". If set to one of these, PySR will run in distributed
mode, and use `procs` to figure out how many processes to launch.
Default is `None`.
heap_size_hint_in_bytes : int
For multiprocessing, this sets the `--heap-size-hint` parameter
for new Julia processes. This can be configured when using
multi-node distributed compute, to give a hint to each process
about how much memory they can use before aggressive garbage
collection.
batching : bool
Whether to compare population members on small batches during
evolution. Still uses full dataset for comparing against hall
of fame. Default is `False`.
batch_size : int
The amount of data to use if doing batching. Default is `50`.
fast_cycle : bool
Batch over population subsamples. This is a slightly different
algorithm than regularized evolution, but does cycles 15%
faster. May be algorithmically less efficient.
Default is `False`.
    turbo : bool
(Experimental) Whether to use LoopVectorization.jl to speed up the
search evaluation. Certain operators may not be supported.
Does not support 16-bit precision floats.
Default is `False`.
precision : int
What precision to use for the data. By default this is `32`
(float32), but you can select `64` or `16` as well, giving
you 64 or 16 bits of floating point precision, respectively.
If you pass complex data, the corresponding complex precision
will be used (i.e., `64` for complex128, `32` for complex64).
Default is `32`.
enable_autodiff : bool
Whether to create derivative versions of operators for automatic
differentiation. This is only necessary if you wish to compute
the gradients of an expression within a custom loss function.
Default is `False`.
random_state : int, Numpy RandomState instance or None
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Default is `None`.
deterministic : bool
Make a PySR search give the same result every run.
To use this, you must turn off parallelism
(with `procs`=0, `multithreading`=False),
and set `random_state` to a fixed seed.
Default is `False`.
warm_start : bool
Tells fit to continue from where the last call to fit finished.
If false, each call to fit will be fresh, overwriting previous results.
Default is `False`.
verbosity : int
What verbosity level to use. 0 means minimal print statements.
Default is `1`.
update_verbosity : int
What verbosity level to use for package updates.
Will take value of `verbosity` if not given.
Default is `None`.
print_precision : int
How many significant digits to print for floats. Default is `5`.
progress : bool
Whether to use a progress bar instead of printing to stdout.
Default is `True`.
equation_file : str
Where to save the files (.csv extension).
Default is `None`.
temp_equation_file : bool
Whether to put the hall of fame file in the temp directory.
Deletion is then controlled with the `delete_tempfiles`
parameter.
Default is `False`.
tempdir : str
        Directory for the temporary files. Default is `None`.
delete_tempfiles : bool
Whether to delete the temporary files after finishing.
Default is `True`.
julia_project : str
A Julia environment location containing a Project.toml
(and potentially the source code for SymbolicRegression.jl).
Default gives the Python package directory, where a
Project.toml file should be present from the install.
    update : bool
Whether to automatically update Julia packages when `fit` is called.
You should make sure that PySR is up-to-date itself first, as
the packaged Julia packages may not necessarily include all
updated dependencies.
Default is `False`.
output_jax_format : bool
Whether to create a 'jax_format' column in the output,
containing jax-callable functions and the default parameters in
a jax array.
Default is `False`.
output_torch_format : bool
Whether to create a 'torch_format' column in the output,
containing a torch module with trainable parameters.
Default is `False`.
extra_sympy_mappings : dict[str, Callable]
Provides mappings between custom `binary_operators` or
`unary_operators` defined in julia strings, to those same
operators defined in sympy.
        E.g., if `unary_operators=["inv(x)=1/x"]`, then for the fitted
        model to be exported to sympy, `extra_sympy_mappings`
        would be `{"inv": lambda x: 1/x}`.
Default is `None`.
extra_jax_mappings : dict[Callable, str]
Similar to `extra_sympy_mappings` but for model export
to jax. The dictionary maps sympy functions to jax functions.
For example: `extra_jax_mappings={sympy.sin: "jnp.sin"}` maps
the `sympy.sin` function to the equivalent jax expression `jnp.sin`.
Default is `None`.
extra_torch_mappings : dict[Callable, Callable]
The same as `extra_jax_mappings` but for model export
to pytorch. Note that the dictionary keys should be callable
pytorch expressions.
For example: `extra_torch_mappings={sympy.sin: torch.sin}`.
Default is `None`.
denoise : bool
Whether to use a Gaussian Process to denoise the data before
inputting to PySR. Can help PySR fit noisy data.
Default is `False`.
select_k_features : int
        Run feature selection in Python using random forests
        before passing the data to the symbolic regression code. `None`
        means no feature selection; an int means select that many features.
        Default is `None`.
julia_kwargs : dict
Keyword arguments to pass to `julia.core.Julia(...)` to initialize
the Julia runtime. The default, when `None`, is to set `threads` equal
to `procs`, and `optimize` to 3.
Default is `None`.
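        For example, `julia_kwargs={"optimize": 3, "threads": 4}` would
        mirror the default behavior when `procs=4`.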
**kwargs : dict
Supports deprecated keyword arguments. Other arguments will
result in an error.

    Attributes
    ----------
equations_ : pandas.DataFrame | list[pandas.DataFrame]
Processed DataFrame containing the results of model fitting.
n_features_in_ : int
Number of features seen during :term:`fit`.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
display_feature_names_in_ : ndarray of shape (`n_features_in_`,)
Pretty names of features, used only during printing.
X_units_ : list[str] of length n_features
Units of each variable in the training dataset, `X`.
y_units_ : str | list[str] of length n_out
Units of each variable in the training dataset, `y`.
nout_ : int
Number of output dimensions.
selection_mask_ : list[int] of length `select_k_features`
List of indices for input features that are selected when
`select_k_features` is set.
tempdir_ : Path
Path to the temporary equations directory.
equation_file_ : str
Output equation file name produced by the julia backend.
raw_julia_state_ : tuple[list[PyCall.jlwrap], PyCall.jlwrap]
The state for the julia SymbolicRegression.jl backend post fitting.
equation_file_contents_ : list[pandas.DataFrame]
Contents of the equation file output by the Julia backend.
show_pickle_warnings_ : bool
Whether to show warnings about what attributes can be pickled.

    Examples
    --------
```python
>>> import numpy as np
>>> from pysr import PySRRegressor
>>> randstate = np.random.RandomState(0)
>>> X = 2 * randstate.randn(100, 5)
    >>> # y = 2.5382 * cos(x_3) + x_0^2 - 0.5
>>> y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5
>>> model = PySRRegressor(
... niterations=40,
... binary_operators=["+", "*"],
... unary_operators=[
... "cos",
... "exp",
... "sin",
... "inv(x) = 1/x", # Custom operator (julia syntax)
... ],
... model_selection="best",
... loss="loss(x, y) = (x - y)^2", # Custom loss function (julia syntax)
... )
>>> model.fit(X, y)
>>> model
PySRRegressor.equations_ = [
0 0.000000 3.8552167 3.360272e+01 1
1 1.189847 (x0 * x0) 3.110905e+00 3
2 0.010626 ((x0 * x0) + -0.25573406) 3.045491e+00 5
3 0.896632 (cos(x3) + (x0 * x0)) 1.242382e+00 6
4 0.811362 ((x0 * x0) + (cos(x3) * 2.4384754)) 2.451971e-01 8
5 >>>> 13.733371 (((cos(x3) * 2.5382) + (x0 * x0)) + -0.5) 2.889755e-13 10
6 0.194695 ((x0 * x0) + (((cos(x3) + -0.063180044) * 2.53... 1.957723e-13 12
7 0.006988 ((x0 * x0) + (((cos(x3) + -0.32505524) * 1.538... 1.944089e-13 13
8 0.000955 (((((x0 * x0) + cos(x3)) + -0.8251649) + (cos(... 1.940381e-13 15
]
>>> model.score(X, y)
1.0
>>> model.predict(np.array([1,2,3,4,5]))
array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818])
```
"""

    def __init__(
self,
model_selection: Literal["best", "accuracy", "score"] = "best",
*,
binary_operators: Optional[List[str]] = None,
unary_operators: Optional[List[str]] = None,
niterations: int = 40,
populations: int = 15,
population_size: int = 33,
max_evals: Optional[int] = None,
maxsize: int = 20,
maxdepth: Optional[int] = None,
warmup_maxsize_by: Optional[float] = None,
timeout_in_seconds: Optional[float] = None,
constraints: Optional[Dict[str, Union[int, Tuple[int, int]]]] = None,
nested_constraints: Optional[Dict[str, Dict[str, int]]] = None,
loss: Optional[str] = None,
full_objective: Optional[str] = None,
complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
complexity_of_constants: Union[int, float] = 1,
complexity_of_variables: Union[int, float] = 1,
parsimony: float = 0.0032,
dimensional_constraint_penalty: Optional[float] = None,
use_frequency: bool = True,
use_frequency_in_tournament: bool = True,
adaptive_parsimony_scaling: float = 20.0,
alpha: float = 0.1,
annealing: bool = False,
early_stop_condition: Optional[Union[float, str]] = None,
ncyclesperiteration: int = 550,
fraction_replaced: float = 0.000364,
fraction_replaced_hof: float = 0.035,
weight_add_node: float = 0.79,
weight_insert_node: float = 5.1,
weight_delete_node: float = 1.7,
weight_do_nothing: float = 0.21,
weight_mutate_constant: float = 0.048,
weight_mutate_operator: float = 0.47,
weight_swap_operands: float = 0.0,
weight_randomize: float = 0.00023,
weight_simplify: float = 0.0020,
weight_optimize: float = 0.0,
crossover_probability: float = 0.066,
skip_mutation_failures: bool = True,
migration: bool = True,
hof_migration: bool = True,
topn: int = 12,
should_simplify: Optional[bool] = None,
should_optimize_constants: bool = True,
optimizer_algorithm: Literal["BFGS", "NelderMead"] = "BFGS",
optimizer_nrestarts: int = 2,
optimize_probability: float = 0.14,
optimizer_iterations: int = 8,
perturbation_factor: float = 0.076,
tournament_selection_n: int = 10,
tournament_selection_p: float = 0.86,
procs: int = cpu_count(),
multithreading: Optional[bool] = None,
cluster_manager: Optional[
Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"]
] = None,
heap_size_hint_in_bytes: Optional[int] = None,
batching: bool = False,
batch_size: int = 50,
fast_cycle: bool = False,
turbo: bool = False,
precision: int = 32,
enable_autodiff: bool = False,
random_state=None,
deterministic: bool = False,
warm_start: bool = False,
verbosity: int = 1,
update_verbosity: Optional[int] = None,
print_precision: int = 5,
progress: bool = True,
equation_file: Optional[str] = None,
temp_equation_file: bool = False,
tempdir: Optional[str] = None,
delete_tempfiles: bool = True,
julia_project: Optional[str] = None,
update: bool = False,
output_jax_format: bool = False,
output_torch_format: bool = False,
extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
extra_torch_mappings: Optional[Dict[Callable, Callable]] = None,
extra_jax_mappings: Optional[Dict[Callable, str]] = None,
denoise: bool = False,
select_k_features: Optional[int] = None,
julia_kwargs: Optional[Dict] = None,
**kwargs,
):
# Hyperparameters
# - Model search parameters
self.model_selection = model_selection
self.binary_operators = binary_operators
self.unary_operators = unary_operators
self.niterations = niterations
self.populations = populations
self.population_size = population_size
self.ncyclesperiteration = ncyclesperiteration
# - Equation Constraints
self.maxsize = maxsize
self.maxdepth = maxdepth
self.constraints = constraints
self.nested_constraints = nested_constraints
self.warmup_maxsize_by = warmup_maxsize_by
self.should_simplify = should_simplify
# - Early exit conditions:
self.max_evals = max_evals
self.timeout_in_seconds = timeout_in_seconds
self.early_stop_condition = early_stop_condition
# - Loss parameters
self.loss = loss
self.full_objective = full_objective
self.complexity_of_operators = complexity_of_operators
self.complexity_of_constants = complexity_of_constants
self.complexity_of_variables = complexity_of_variables
self.parsimony = parsimony
self.dimensional_constraint_penalty = dimensional_constraint_penalty
self.use_frequency = use_frequency
self.use_frequency_in_tournament = use_frequency_in_tournament
self.adaptive_parsimony_scaling = adaptive_parsimony_scaling
self.alpha = alpha
self.annealing = annealing
# - Evolutionary search parameters
# -- Mutation parameters
self.weight_add_node = weight_add_node
self.weight_insert_node = weight_insert_node
self.weight_delete_node = weight_delete_node
self.weight_do_nothing = weight_do_nothing
self.weight_mutate_constant = weight_mutate_constant
self.weight_mutate_operator = weight_mutate_operator
self.weight_swap_operands = weight_swap_operands
self.weight_randomize = weight_randomize
self.weight_simplify = weight_simplify
self.weight_optimize = weight_optimize
self.crossover_probability = crossover_probability
self.skip_mutation_failures = skip_mutation_failures
# -- Migration parameters
self.migration = migration
self.hof_migration = hof_migration
self.fraction_replaced = fraction_replaced
self.fraction_replaced_hof = fraction_replaced_hof
self.topn = topn
# -- Constants parameters
self.should_optimize_constants = should_optimize_constants
self.optimizer_algorithm = optimizer_algorithm
self.optimizer_nrestarts = optimizer_nrestarts
self.optimize_probability = optimize_probability
self.optimizer_iterations = optimizer_iterations
self.perturbation_factor = perturbation_factor
# -- Selection parameters
self.tournament_selection_n = tournament_selection_n
self.tournament_selection_p = tournament_selection_p
# -- Performance parameters
self.procs = procs
self.multithreading = multithreading
self.cluster_manager = cluster_manager
self.heap_size_hint_in_bytes = heap_size_hint_in_bytes
self.batching = batching
self.batch_size = batch_size
self.fast_cycle = fast_cycle
self.turbo = turbo
self.precision = precision
self.enable_autodiff = enable_autodiff
self.random_state = random_state
self.deterministic = deterministic
self.warm_start = warm_start
# Additional runtime parameters
# - Runtime user interface
self.verbosity = verbosity
self.update_verbosity = update_verbosity
self.print_precision = print_precision
self.progress = progress
# - Project management
self.equation_file = equation_file
self.temp_equation_file = temp_equation_file
self.tempdir = tempdir
self.delete_tempfiles = delete_tempfiles
self.julia_project = julia_project
self.update = update
self.output_jax_format = output_jax_format
self.output_torch_format = output_torch_format
self.extra_sympy_mappings = extra_sympy_mappings
self.extra_jax_mappings = extra_jax_mappings
self.extra_torch_mappings = extra_torch_mappings
# Pre-modelling transformation
self.denoise = denoise
self.select_k_features = select_k_features
self.julia_kwargs = julia_kwargs
        # Once all valid parameters have been assigned, handle the
        # deprecated kwargs.
if len(kwargs) > 0: # pragma: no cover
deprecated_kwargs = make_deprecated_kwargs_for_pysr_regressor()
for k, v in kwargs.items():
# Handle renamed kwargs
if k in deprecated_kwargs:
updated_kwarg_name = deprecated_kwargs[k]
setattr(self, updated_kwarg_name, v)
warnings.warn(
f"{k} has been renamed to {updated_kwarg_name} in PySRRegressor. "
"Please use that instead.",
FutureWarning,
)
# Handle kwargs that have been moved to the fit method
elif k in ["weights", "variable_names", "Xresampled"]:
warnings.warn(
f"{k} is a data dependant parameter so should be passed when fit is called. "
f"Ignoring parameter; please pass {k} during the call to fit instead.",
FutureWarning,
)
else:
raise TypeError(
f"{k} is not a valid keyword argument for PySRRegressor."
)

    @classmethod
def from_file(
cls,
equation_file,
*,
binary_operators=None,
unary_operators=None,
n_features_in=None,
feature_names_in=None,
selection_mask=None,
nout=1,
**pysr_kwargs,
):
"""
        Create a model from a saved model checkpoint or equation file.

        Parameters
        ----------
equation_file : str
Path to a pickle file containing a saved model, or a csv file
containing equations.
binary_operators : list[str]
The same binary operators used when creating the model.
Not needed if loading from a pickle file.
unary_operators : list[str]
The same unary operators used when creating the model.
Not needed if loading from a pickle file.
n_features_in : int
Number of features passed to the model.
Not needed if loading from a pickle file.
feature_names_in : list[str]
Names of the features passed to the model.
Not needed if loading from a pickle file.
selection_mask : list[bool]
If using select_k_features, you must pass `model.selection_mask_` here.
Not needed if loading from a pickle file.
nout : int
Number of outputs of the model.
Not needed if loading from a pickle file.
Default is `1`.
**pysr_kwargs : dict
Any other keyword arguments to initialize the PySRRegressor object.
These will overwrite those stored in the pickle file.
Not needed if loading from a pickle file.

        Returns
        -------
model : PySRRegressor
The model with fitted equations.
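
        Examples
        --------
        ```python
        >>> # Hypothetical path; the matching "hall_of_fame.pkl" checkpoint
        >>> # is loaded if present, else the csv is parsed directly:
        >>> model = PySRRegressor.from_file("hall_of_fame.csv")
        ```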
"""
pkl_filename = _csv_filename_to_pkl_filename(equation_file)
# Try to load model from <equation_file>.pkl
print(f"Checking if {pkl_filename} exists...")
if os.path.exists(pkl_filename):
print(f"Loading model from {pkl_filename}")
assert binary_operators is None
assert unary_operators is None
assert n_features_in is None
with open(pkl_filename, "rb") as f:
model = pkl.load(f)
# Change equation_file_ to be in the same dir as the pickle file
base_dir = os.path.dirname(pkl_filename)
base_equation_file = os.path.basename(model.equation_file_)
model.equation_file_ = os.path.join(base_dir, base_equation_file)
# Update any parameters if necessary, such as
# extra_sympy_mappings:
model.set_params(**pysr_kwargs)
if "equations_" not in model.__dict__ or model.equations_ is None:
model.refresh()
return model
# Else, we re-create it.
print(
f"{pkl_filename} does not exist, "
"so we must create the model from scratch."
)
assert binary_operators is not None or unary_operators is not None
assert n_features_in is not None
# TODO: copy .bkup file if exists.
model = cls(
equation_file=equation_file,
binary_operators=binary_operators,
unary_operators=unary_operators,
**pysr_kwargs,
)
model.nout_ = nout
model.n_features_in_ = n_features_in
if feature_names_in is None:
model.feature_names_in_ = np.array([f"x{i}" for i in range(n_features_in)])
model.display_feature_names_in_ = np.array(
[f"x{_subscriptify(i)}" for i in range(n_features_in)]
)
else:
assert len(feature_names_in) == n_features_in
model.feature_names_in_ = feature_names_in
model.display_feature_names_in_ = feature_names_in
if selection_mask is None:
model.selection_mask_ = np.ones(n_features_in, dtype=bool)
else:
model.selection_mask_ = selection_mask
model.refresh(checkpoint_file=equation_file)
return model

    def __repr__(self):
"""
Print all current equations fitted by the model.