Skip to content

Commit e4f863d

Browse files
committed
Remove unused tmp path from ingest* and ImportWorkflow
1 parent 54e11be commit e4f863d

File tree

3 files changed

+21
-24
lines changed

3 files changed

+21
-24
lines changed

ivory-cli/src/main/scala/com/ambiata/ivory/cli/ingest.scala

+7-8
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import scalaz.{DList => _, _}, Scalaz._
1919

2020
object ingest extends IvoryApp {
2121

22-
case class CliArguments(repo: String, dictionary: Option[String], input: String, namespace: String, tmp: String, timezone: DateTimeZone, runOnSingleMachine: Boolean)
22+
case class CliArguments(repo: String, dictionary: Option[String], input: String, namespace: String, timezone: DateTimeZone, runOnSingleMachine: Boolean)
2323

2424
val parser = new scopt.OptionParser[CliArguments]("ingest") {
2525
head("""
@@ -31,7 +31,6 @@ object ingest extends IvoryApp {
3131

3232
help("help") text "shows this usage text"
3333
opt[String]('r', "repo") action { (x, c) => c.copy(repo = x) } required() text "Path to an ivory repository."
34-
opt[String]('t', "tmp") action { (x, c) => c.copy(tmp = x) } required() text "Path to store tmp data."
3534
opt[String]('i', "input") action { (x, c) => c.copy(input = x) } required() text "Path to data to import."
3635
opt[String]('d', "dictionary") action { (x, c) => c.copy(dictionary = Some(x)) } text "Name of dictionary to use."
3736
opt[String]('n', "namespace") action { (x, c) => c.copy(namespace = x) } required() text "Namespace'."
@@ -41,20 +40,20 @@ object ingest extends IvoryApp {
4140

4241
}
4342

44-
def cmd = IvoryCmd[CliArguments](parser, CliArguments("", None, "", "", "", DateTimeZone.getDefault, false), HadoopCmd { configuration => c =>
45-
val res = onHdfs(new Path(c.repo), c.dictionary, c.namespace, new Path(c.input), new Path(c.tmp), c.timezone, c.runOnSingleMachine)
43+
def cmd = IvoryCmd[CliArguments](parser, CliArguments("", None, "", "", DateTimeZone.getDefault, false), HadoopCmd { configuration => c =>
44+
val res = onHdfs(new Path(c.repo), c.dictionary, c.namespace, new Path(c.input), c.timezone, c.runOnSingleMachine)
4645
res.run(configuration.modeIs(com.nicta.scoobi.core.Mode.Cluster)).map {
4746
case f => List(s"Successfully imported '${c.input}' as ${f} into '${c.repo}'")
4847
}
4948
})
5049

51-
def onHdfs(repo: Path, dictionary: Option[String], namespace: String, input: Path, tmp: Path, timezone: DateTimeZone, runOnSingleMachine: Boolean): ScoobiAction[Factset] =
52-
fatrepo.ImportWorkflow.onHdfs(repo, dictionary.map(defaultDictionaryImport(_)), importFeed(input, namespace, runOnSingleMachine), tmp, timezone)
50+
def onHdfs(repo: Path, dictionary: Option[String], namespace: String, input: Path, timezone: DateTimeZone, runOnSingleMachine: Boolean): ScoobiAction[Factset] =
51+
fatrepo.ImportWorkflow.onHdfs(repo, dictionary.map(defaultDictionaryImport(_)), importFeed(input, namespace, runOnSingleMachine), timezone)
5352

54-
def defaultDictionaryImport(dictionary: String)(repo: HdfsRepository, name: String, tmpPath: Path): Hdfs[Unit] =
53+
def defaultDictionaryImport(dictionary: String)(repo: HdfsRepository, name: String): Hdfs[Unit] =
5554
DictionaryImporter.onHdfs(repo.root.toHdfs, repo.dictionaryByName(dictionary).toHdfs, name)
5655

57-
def importFeed(input: Path, namespace: String, runOnSingleMachine: Boolean)(repo: HdfsRepository, factset: Factset, dname: String, tmpPath: Path, errorPath: Path, timezone: DateTimeZone): ScoobiAction[Unit] = for {
56+
def importFeed(input: Path, namespace: String, runOnSingleMachine: Boolean)(repo: HdfsRepository, factset: Factset, dname: String, errorPath: Path, timezone: DateTimeZone): ScoobiAction[Unit] = for {
5857
dict <- ScoobiAction.fromHdfs(IvoryStorage.dictionaryFromIvory(repo, dname))
5958
conf <- ScoobiAction.scoobiConfiguration
6059
_ <- if (!runOnSingleMachine)

ivory-cli/src/main/scala/com/ambiata/ivory/cli/ingestBulk.scala

+7-8
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import scalaz.{DList => _, _}, Scalaz._
1818

1919
object ingestBulk extends IvoryApp {
2020

21-
case class CliArguments(repo: String, dictionary: Option[String], input: String, tmp: String, timezone: DateTimeZone, optimal: Long, codec: Option[CompressionCodec])
21+
case class CliArguments(repo: String, dictionary: Option[String], input: String, timezone: DateTimeZone, optimal: Long, codec: Option[CompressionCodec])
2222

2323
val parser = new scopt.OptionParser[CliArguments]("ingest-bulk") {
2424
head("""
@@ -32,7 +32,6 @@ object ingestBulk extends IvoryApp {
3232
opt[Unit]('n', "no-compression") action { (_, c) => c.copy(codec = None) } text "Don't use compression."
3333

3434
opt[String]('r', "repo") action { (x, c) => c.copy(repo = x) } required() text "Path to an ivory repository."
35-
opt[String]('t', "tmp") action { (x, c) => c.copy(tmp = x) } required() text "Path to store tmp data."
3635
opt[String]('i', "input") action { (x, c) => c.copy(input = x) } required() text "Path to data to import."
3736
opt[Long]('o', "optimal-input-chunk") action { (x, c) => c.copy(optimal = x) } text "Optimal size (in bytes) of input chunk.."
3837
opt[String]('d', "dictionary") action { (x, c) => c.copy(dictionary = Some(x)) } text "Name of dictionary to use."
@@ -45,21 +44,21 @@ object ingestBulk extends IvoryApp {
4544
type Parts = String
4645

4746
def cmd = IvoryCmd[CliArguments](parser,
48-
CliArguments("", None, "", "", DateTimeZone.getDefault, 1024 * 1024 * 256 /* 256MB */, Some(new SnappyCodec)),
47+
CliArguments("", None, "", DateTimeZone.getDefault, 1024 * 1024 * 256 /* 256MB */, Some(new SnappyCodec)),
4948
ScoobiCmd(configuration => c => {
50-
val res = onHdfs(new Path(c.repo), c.dictionary, new Path(c.input), new Path(c.tmp), c.timezone, c.optimal, c.codec)
49+
val res = onHdfs(new Path(c.repo), c.dictionary, new Path(c.input), c.timezone, c.optimal, c.codec)
5150
res.run(configuration).map {
5251
case f => List(s"Successfully imported '${c.input}' as ${f} into '${c.repo}'")
5352
}
5453
}))
5554

56-
def onHdfs(repo: Path, dictionary: Option[String], input: Path, tmp: Path, timezone: DateTimeZone, optimal: Long, codec: Option[CompressionCodec]): ScoobiAction[Factset] =
57-
fatrepo.ImportWorkflow.onHdfs(repo, dictionary.map(defaultDictionaryImport(_)), importFeed(input, optimal, codec), tmp, timezone)
55+
def onHdfs(repo: Path, dictionary: Option[String], input: Path, timezone: DateTimeZone, optimal: Long, codec: Option[CompressionCodec]): ScoobiAction[Factset] =
56+
fatrepo.ImportWorkflow.onHdfs(repo, dictionary.map(defaultDictionaryImport(_)), importFeed(input, optimal, codec), timezone)
5857

59-
def defaultDictionaryImport(dictionary: String)(repo: HdfsRepository, name: String, tmpPath: Path): Hdfs[Unit] =
58+
def defaultDictionaryImport(dictionary: String)(repo: HdfsRepository, name: String): Hdfs[Unit] =
6059
DictionaryImporter.onHdfs(repo.root.toHdfs, repo.dictionaryByName(dictionary).toHdfs, name)
6160

62-
def importFeed(input: Path, optimal: Long, codec: Option[CompressionCodec])(repo: HdfsRepository, factset: Factset, dname: String, tmpPath: Path, errorPath: Path, timezone: DateTimeZone): ScoobiAction[Unit] = for {
61+
def importFeed(input: Path, optimal: Long, codec: Option[CompressionCodec])(repo: HdfsRepository, factset: Factset, dname: String, errorPath: Path, timezone: DateTimeZone): ScoobiAction[Unit] = for {
6362
dict <- ScoobiAction.fromHdfs(IvoryStorage.dictionaryFromIvory(repo, dname))
6463
list <- listing(input)
6564
conf <- ScoobiAction.scoobiConfiguration

ivory-storage/src/main/scala/com/ambiata/ivory/storage/legacy/fatrepo/ImportWorkflow.scala

+7-8
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,12 @@ object ImportWorkflow {
3636
type DictionaryName = String
3737
type DictionaryPath = Path
3838
type ErrorPath = Path
39-
type TmpPath = Path
40-
type ImportDictFunc = (HdfsRepository, DictionaryName, TmpPath) => Hdfs[Unit]
41-
type ImportFactsFunc = (HdfsRepository, Factset, DictionaryName, TmpPath, ErrorPath, DateTimeZone) => ScoobiAction[Unit]
39+
type ImportDictFunc = (HdfsRepository, DictionaryName) => Hdfs[Unit]
40+
type ImportFactsFunc = (HdfsRepository, Factset, DictionaryName, ErrorPath, DateTimeZone) => ScoobiAction[Unit]
4241

4342
private implicit val logger = LogFactory.getLog("ivory.repository.fatrepo.Import")
4443

45-
def onHdfs(repoPath: Path, importDict: Option[ImportDictFunc], importFacts: ImportFactsFunc, tmpPath: Path, timezone: DateTimeZone): ScoobiAction[Factset] = {
44+
def onHdfs(repoPath: Path, importDict: Option[ImportDictFunc], importFacts: ImportFactsFunc, timezone: DateTimeZone): ScoobiAction[Factset] = {
4645
val start = System.currentTimeMillis
4746
for {
4847
sc <- ScoobiAction.scoobiConfiguration
@@ -53,7 +52,7 @@ object ImportWorkflow {
5352
println(s"created repository in ${x - start}ms")
5453
x
5554
}
56-
dname <- ScoobiAction.fromHdfs(importDictionary(repo, new Path(tmpPath, "dictionaries"), importDict)
55+
dname <- ScoobiAction.fromHdfs(importDictionary(repo, importDict)
5756
)
5857
t2 = {
5958
val x = System.currentTimeMillis
@@ -66,7 +65,7 @@ object ImportWorkflow {
6665
println(s"created fact set in ${x - t2}ms")
6766
x
6867
}
69-
_ <- importFacts(repo, factset, dname, new Path(tmpPath, "facts"), new Path(repo.errors.path, factset.name), timezone)
68+
_ <- importFacts(repo, factset, dname, new Path(repo.errors.path, factset.name), timezone)
7069
t4 = {
7170
val x = System.currentTimeMillis
7271
println(s"imported fact set in ${x - t3}ms")
@@ -95,7 +94,7 @@ object ImportWorkflow {
9594
}
9695
} yield ()
9796

98-
def importDictionary(repo: HdfsRepository, tmpPath: Path, importer: Option[ImportDictFunc]): Hdfs[String] = importer match {
97+
def importDictionary(repo: HdfsRepository, importer: Option[ImportDictFunc]): Hdfs[String] = importer match {
9998
case None =>
10099
Hdfs.globPaths(repo.dictionaries.toHdfs, "*").map(dicts =>
101100
dicts
@@ -110,7 +109,7 @@ object ImportWorkflow {
110109
for {
111110
e <- Hdfs.exists(repo.dictionaryByName(name).toHdfs)
112111
_ <- if(!e) copyLatestDictionary(repo, name) else Hdfs.ok(())
113-
_ <- importDict(repo, name, tmpPath)
112+
_ <- importDict(repo, name)
114113
_ = logger.info(s"Successfully imported dictionary '${name}'")
115114
} yield name
116115
}

0 commit comments

Comments (0)