Môžete zapuzdriť logiku ako UDF
a na tomto základe vytvoriť
stĺpec NewArray.
Logiku na identifikáciu prvkov poľa, ktorých súčet sa rovná cieľovej hodnote, som si požičal odtiaľto.
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.sql.functions import udf
from decimal import Decimal
# Sample rows of (Array, Target), deliberately including edge cases:
# an empty array, a null array, and a null target.
data = [([0.0001,2.5,3.0,0.0031], 0.0032),
([2.5, 1.0, 0.5, 3.0], 3.0),
([1.0, 1.0, 1.5, 1.0], 4.5),
([], 1.0),
(None, 1.0),
([1.0,2.0], None),]
# NOTE(review): relies on a pre-existing `spark` session (shell/notebook context).
df = spark.createDataFrame(data, ("Array", "Target", ))
@udf(returnType=ArrayType(DoubleType()))
def find_values_summing_to_target(array, target):
    """Return elements of ``array`` belonging to subsets that sum exactly
    to ``target``; values go through Decimal to avoid float rounding drift.

    Returns an empty list when ``array`` or ``target`` is null.

    NOTE(review): every matching subset is flattened into the single result
    list, and the pruning (stop once the partial sum reaches the target)
    assumes non-negative inputs — confirm both are intended.
    """
    def _search(candidates, goal, chosen, matches):
        # Depth-first enumeration of subsets of `candidates`.
        running = sum(chosen)
        if running == goal:
            matches.extend(chosen)
        if running >= goal:
            # No point extending this branch further (non-negative values).
            return
        for idx, value in enumerate(candidates):
            _search(candidates[idx + 1:], goal, chosen + [value], matches)

    matches = []
    if array is not None and target is not None:
        as_decimals = [Decimal(str(item)) for item in array]
        _search(as_decimals, Decimal(str(target)), [], matches)
    return [float(m) for m in matches]
# Apply the UDF per row and display all rows without truncating cell values.
df.withColumn("NewArray", find_values_summing_to_target("Array", "Target")).show(200, False)
Výstup
+--------------------------+------+--------------------+
|Array |Target|NewArray |
+--------------------------+------+--------------------+
|[1.0E-4, 2.5, 3.0, 0.0031]|0.0032|[1.0E-4, 0.0031] |
|[2.5, 1.0, 0.5, 3.0] |3.0 |[2.5, 0.5, 3.0] |
|[1.0, 1.0, 1.5, 1.0] |4.5 |[1.0, 1.0, 1.5, 1.0]|
|[] |1.0 |[] |
|null |1.0 |[] |
|[1.0, 2.0] |null |[] |
+--------------------------+------+--------------------+