@@ -12,13 +12,36 @@ import FactFormats._
12
12
13
13
object FactDiff {
14
14
15
- def scoobiJob (input1 : String , input2 : String , outputPath : String , errorPath : String ): ScoobiAction [Unit ] = {
15
+ def partitionFacts (input1 : String , input2 : String , outputPath : String ): ScoobiAction [Unit ] = for {
16
+ res <- ScoobiAction .scoobiJob({ implicit sc : ScoobiConfiguration =>
17
+ val dlist1 = PartitionFactThriftStorageV1 .PartitionedFactThriftLoader (List (input1)).loadScoobi.map({
18
+ case -\/ (e) => sys.error(s " Can not parse fact - ${e}" )
19
+ case \/- (f) => f
20
+ })
21
+ val dlist2 = PartitionFactThriftStorageV1 .PartitionedFactThriftLoader (List (input2)).loadScoobi.map({
22
+ case -\/ (e) => sys.error(s " Can not parse fact - ${e}" )
23
+ case \/- (f) => f
24
+ })
25
+ (dlist1, dlist2)
26
+ })
27
+ (dlist1, dlist2) = res
28
+ _ <- scoobiJob(dlist1, dlist2, outputPath)
29
+ } yield ()
30
+
31
+ def flatFacts (input1 : String , input2 : String , outputPath : String ): ScoobiAction [Unit ] = for {
32
+ res <- ScoobiAction .scoobiJob({ implicit sc : ScoobiConfiguration =>
33
+ val dlist1 = valueFromSequenceFile[Fact ](input1)
34
+ val dlist2 = valueFromSequenceFile[Fact ](input2)
35
+ (dlist1, dlist2)
36
+ })
37
+ (dlist1, dlist2) = res
38
+ _ <- scoobiJob(dlist1, dlist2, outputPath)
39
+ } yield ()
40
+
41
+ def scoobiJob (first_facts : DList [Fact ], second_facts : DList [Fact ], outputPath : String ): ScoobiAction [Unit ] = {
16
42
ScoobiAction .scoobiJob({ implicit sc : ScoobiConfiguration =>
17
- val (first_errs, first_facts) = byflag(PartitionFactThriftStorageV1 .PartitionedFactThriftLoader (List (input1)).loadScoobi, true )
18
- val (second_errs, second_facts) = byflag(PartitionFactThriftStorageV1 .PartitionedFactThriftLoader (List (input2)).loadScoobi, false )
19
43
20
- val errors = first_errs ++ second_errs
21
- val facts = first_facts ++ second_facts
44
+ val facts = first_facts.map((true , _)) ++ second_facts.map((false , _))
22
45
23
46
val grp = facts.groupBy({ case (flag, fact) => (fact.entity, fact.featureId.toString, fact.date.int, fact.time.seconds, fact.value.stringValue) })
24
47
@@ -31,28 +54,12 @@ object FactDiff {
31
54
})
32
55
33
56
val out : DList [String ] = diff.map({
34
- case (true , fact) :: Nil => s " Fact ' ${fact}' does not exist in ${ input2} "
35
- case (false , fact) :: Nil => s " Fact ' ${fact}' does not exist in ${ input1} "
57
+ case (true , fact) :: Nil => s " Fact ' ${fact}' does not exist in input2 "
58
+ case (false , fact) :: Nil => s " Fact ' ${fact}' does not exist in input1 "
36
59
case g => s " Found duplicates - ' ${g}' "
37
60
})
38
61
39
- val error_out : DList [String ] = errors.map({
40
- case (true , e) => s " ${e.message} - ${input1}"
41
- case (false , e) => s " ${e.message} - ${input2}"
42
- })
43
-
44
- persist(error_out.toTextFile(errorPath, overwrite = true ), out.toTextFile(outputPath, overwrite = true ))
62
+ persist(out.toTextFile(outputPath, overwrite = true ))
45
63
})
46
64
}
47
-
48
- def byflag (dlist : DList [ParseError \/ Fact ], flag : Boolean ): (DList [(Boolean , ParseError )], DList [(Boolean , Fact )]) = {
49
- val errs = dlist.collect {
50
- case -\/ (e) => (flag, e)
51
- }
52
-
53
- val facts = dlist.collect {
54
- case \/- (f) => (flag, f)
55
- }
56
- (errs, facts)
57
- }
58
65
}
0 commit comments