def distanceBetweenTuples(data1, data2):
    """Return the Euclidean distance between two equal-length numeric sequences.

    Args:
        data1: Coordinates of the first point.
        data2: Coordinates of the second point.

    Returns:
        The square root of the summed squared coordinate differences, as a
        float (0.0 when both sequences are empty).

    Raises:
        ValueError: If the two sequences do not have the same length.
    """
    if len(data1) != len(data2):
        # Without this check, surplus coordinates in data2 would be silently
        # ignored and the result would not be a real distance.
        raise ValueError(
            "both points must have the same number of coordinates "
            "(got %d and %d)" % (len(data1), len(data2))
        )
    # Accumulate left to right from a float zero so the result is always a
    # float and the summation order matches the coordinate order.
    squaredSum = 0.0
    for first, second in zip(data1, data2):
        squaredSum += (first - second) ** 2
    return squaredSum ** 0.5
# Two sample 3-coordinate points used to try out the distance function.
pythonTuple1, pythonTuple2 = (1.2, 3.4, 3.2), (2.4, 2.2, 4.2)
# As a bare expression this only displays its value in an interactive shell.
distanceBetweenTuples(data1=pythonTuple1, data2=pythonTuple2)
输出结果:1.9697715603592207
# Labelled training records, each shaped ((x, y, z), group label).
knnDataList = [
    ((3.09, 1.97, 3.73), 'group1'),
    ((2.96, 2.15, 4.16), 'group1'),
    ((2.87, 1.93, 4.39), 'group1'),
    ((3.02, 1.55, 4.43), 'group1'),
    ((1.80, 3.65, 2.08), 'group2'),
    ((1.36, 4.43, 1.95), 'group2'),
    ((1.71, 4.35, 1.94), 'group2'),
    ((1.03, 3.75, 2.12), 'group2'),
    ((2.30, 3.59, 1.99), 'group2'),
]
# The single unlabelled record that is to be classified.
newRecord = [(2.5, 1.7, 4.2)]

# Training data is spread over 4 partitions; the new record fits in 1.
knnDataRDD = sc.parallelize(knnDataList, numSlices=4)
newRecordRDD = sc.parallelize(newRecord, numSlices=1)

# Pair every training record with the new record.
cartesianDataRDD = knnDataRDD.cartesian(newRecordRDD)
cartesianDataRDD.take(5)
输出结果:
[(((3.09, 1.97, 3.73), 'group1'), (2.5, 1.7, 4.2)),
(((2.96, 2.15, 4.16), 'group1'), (2.5, 1.7, 4.2)),
(((2.87, 1.93, 4.39), 'group1'), (2.5, 1.7, 4.2)),
(((3.02, 1.55, 4.43), 'group1'), (2.5, 1.7, 4.2)),
(((1.8, 3.65, 2.08), 'group2'), (2.5, 1.7, 4.2))]
# Number of nearest neighbours that take part in the vote.
K = 5


def _groupWithDistance(pair):
    """Turn one ((features, group), new record) pair into (group, distance)."""
    (features, group), record = pair
    return (group, distanceBetweenTuples(features, record))


groupAndDistanceRDD = cartesianDataRDD.map(_groupWithDistance)
groupAndDistanceRDD.take(5)
[('group1', 0.8011866199581719),
('group1', 0.6447480127925947),
('group1', 0.47528938553264566),
('group1', 0.5880476171195661),
('group2', 2.9642705679475347)]
# Keep the K (group, distance) pairs with the smallest distance, nearest first.
ourClasses = groupAndDistanceRDD.takeOrdered(
    K,
    key=lambda groupAndDistance: groupAndDistance[1],
)
ourClasses
[('group1', 0.47528938553264566),
('group1', 0.5880476171195661),
('group1', 0.6447480127925947),
('group1', 0.8011866199581719),
('group2', 2.9148241799463652)]
# Drop the distances and keep only the group label of each nearest neighbour.
ourClassesGroup = [neighbour[0] for neighbour in ourClasses]
ourClassesGroup
['group1', 'group1', 'group1', 'group1', 'group2']
# Majority vote: pick the label that occurs most often among the neighbours.
max(ourClassesGroup,key=ourClassesGroup.count)
'group1'
具体过程:
Step 1. Creating a function to calculate the distance between two tuples.
def distanceBetweenTuples(data1, data2):
    """Return the Euclidean distance between two equal-length numeric sequences.

    Args:
        data1: Coordinates of the first point.
        data2: Coordinates of the second point.

    Returns:
        The square root of the summed squared coordinate differences, as a
        float (0.0 when both sequences are empty).

    Raises:
        ValueError: If the two sequences do not have the same length.
    """
    if len(data1) != len(data2):
        # Without this check, surplus coordinates in data2 would be silently
        # ignored and the result would not be a real distance.
        raise ValueError(
            "both points must have the same number of coordinates "
            "(got %d and %d)" % (len(data1), len(data2))
        )
    # Accumulate left to right from a float zero so the result is always a
    # float and the summation order matches the coordinate order.
    squaredSum = 0.0
    for first, second in zip(data1, data2):
        squaredSum += (first - second) ** 2
    return squaredSum ** 0.5
Step 2. Creating a list of the given records and transforming it into an RDD.
# Number of nearest neighbours that take part in the vote.
K = 5

# Labelled training records, each shaped ((x, y, z), group label).
knnDataList = [
    ((3.09, 1.97, 3.73), 'group1'),
    ((2.96, 2.15, 4.16), 'group1'),
    ((2.87, 1.93, 4.39), 'group1'),
    ((3.02, 1.55, 4.43), 'group1'),
    ((1.80, 3.65, 2.08), 'group2'),
    ((1.36, 4.43, 1.95), 'group2'),
    ((1.71, 4.35, 1.94), 'group2'),
    ((1.03, 3.75, 2.12), 'group2'),
    ((2.30, 3.59, 1.99), 'group2'),
]

# Distribute the training records over 4 partitions and peek at the first 5.
knnDataRDD = sc.parallelize(knnDataList, numSlices=4)
knnDataRDD.take(5)
Step 3. Broadcasting the record value.
# The single unlabelled record to classify, kept in a one-element list.
newRecord = [(2.5, 1.7, 4.2)]
# Wrap the record in a broadcast variable instead of building a second RDD.
broadCastedValue = sc.broadcast(value=newRecord)
# Inspect the broadcast payload: first the whole list, then the record itself.
broadCastedValue.value
broadCastedValue.value[0]
Step 4. Calculating the distance between the new record and every training record.
def _groupWithDistanceToNewRecord(data):
    """Turn one (features, group) record into (group, distance to new record)."""
    features, group = data
    # The new record is read from the broadcast variable rather than from the RDD.
    target = tuple(broadCastedValue.value[0])
    return (group, distanceBetweenTuples(features, target))


groupAndDistanceRDD = knnDataRDD.map(_groupWithDistanceToNewRecord)
groupAndDistanceRDD.take(5)
Step 5. Finding the class of the new record.
# Keep the K (group, distance) pairs with the smallest distance, nearest first.
ourClasses = groupAndDistanceRDD.takeOrdered(
    K,
    key=lambda groupAndDistance: groupAndDistance[1],
)
ourClasses
# Drop the distances and keep only the group label of each nearest neighbour.
ourClassesGroup = [neighbour[0] for neighbour in ourClasses]
ourClassesGroup
# Majority vote: the label that occurs most often among the neighbours wins.
max(ourClassesGroup, key=ourClassesGroup.count)
网友评论