using Genometric.TVQ.API.Model;
using Newtonsoft.Json;
+using System;
using System.Collections.Generic;
+using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
+using System.IO.Compression;
+using System.Xml.Linq;

namespace Genometric.TVQ.API.Crawlers
{
@@ -15,7 +19,7 @@ public ToolShed()
            _client = new HttpClient();
        }

-       public async Task<List<Tool>> Crawl(Repository repo)
+       public async Task<List<Tool>> GetTools(Repository repo)
        {
            HttpResponseMessage response = await _client.GetAsync(repo.URI);
            string content;
@@ -32,5 +36,100 @@ public async Task<List<Tool>> Crawl(Repository repo)

            return tools;
        }
+
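+       /// <summary>
+       /// Downloads the archive of every given tool from the ToolShed and
+       /// extracts the publications (DOIs and BibTeX citations) declared
+       /// in the tools' XML wrapper files.
+       /// </summary>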
+       public async Task<List<Publication>> GetPublications(Repository repo, List<Tool> tools)
+       {
+           var pubs = new List<Publication>();
+           var rnd = new Random();
+           var tmpPath =
+               Path.GetFullPath(Path.GetTempPath()) +
+               rnd.Next(100000, 10000000) +
+               Path.DirectorySeparatorChar;
+           if (Directory.Exists(tmpPath))
+               Directory.Delete(tmpPath, true);
+           Directory.CreateDirectory(tmpPath);
+
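+           // For each tool: download its repository archive, extract the
+           // XML wrapper files it contains, and collect the citations they declare.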
+           foreach (var tool in tools)
+           {
+               string zipFileName = tmpPath + tool.Id;
+               try
+               {
+                   // TODO: creating a new client for every request may be
+                   // way too expensive. Maybe should check whether the
+                   // client can run multiple concurrent requests in a
+                   // thread-safe fashion?
+                   new System.Net.WebClient().DownloadFile(
+                       address: new Uri(string.Format(
+                           "https://toolshed.g2.bx.psu.edu/repos/{0}/{1}/archive/tip.zip",
+                           tool.Owner,
+                           tool.Name)),
+                       fileName: zipFileName);
+               }
+               catch (Exception)
+               {
+                   // A failed download is swallowed; opening the missing
+                   // archive below fails and is ignored as well.
+               }
+
+               // Normalizes the path.
+               // To avoid `path traversal attacks` from malicious software,
+               // there must be a trailing path separator at the end of the path.
+               string extractPath =
+                   tmpPath + tool.Id + "_" + rnd.Next(100000, 10000000) + "_" +
+                   Path.DirectorySeparatorChar;
+               Directory.CreateDirectory(extractPath);
+
+               try
+               {
+                   using (ZipArchive archive = ZipFile.OpenRead(zipFileName))
+                       foreach (ZipArchiveEntry entry in archive.Entries)
+                           if (entry.FullName.EndsWith(".xml", StringComparison.OrdinalIgnoreCase))
+                           {
+                               var extractedFileName = extractPath + Path.GetFileName(entry.FullName);
+                               entry.ExtractToFile(extractedFileName);
+                               pubs.AddRange(ExtractCitation(extractedFileName, tool));
+                           }
+               }
+               catch (InvalidDataException)
+               {
+                   // This exception is thrown when the zip archive
+                   // cannot be read.
+               }
+               catch (Exception)
+               {
+                   // Any other failure for this tool is ignored so that
+                   // the remaining tools can still be processed.
+               }
+           }
+
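+           // Remove the staging directory and everything downloaded into it.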
+           Directory.Delete(tmpPath, true);
+           return pubs;
+       }
+
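+       /// <summary>
+       /// Reads a tool's XML wrapper and returns one publication per
+       /// citation element, keeping DOI and BibTeX citations.
+       /// </summary>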
+       private List<Publication> ExtractCitation(string filename, Tool tool)
+       {
+           var pubs = new List<Publication>();
+           XElement toolDoc = XElement.Load(filename);
+
+           // Citation elements without a `type` attribute are skipped.
+           foreach (var item in toolDoc.Elements("citations").Descendants())
+               switch (item.Attribute("type")?.Value.Trim().ToLower())
+               {
+                   case "doi":
+                       pubs.Add(new Publication()
+                       {
+                           ToolId = tool.Id,
+                           DOI = item.Value
+                       });
+                       break;
+
+                   case "bibtex":
+                       pubs.Add(new Publication()
+                       {
+                           ToolId = tool.Id,
+                           Citation = item.Value
+                       });
+                       break;
+               }
+
+           return pubs;
+       }
    }
}