Skip to content

Commit 045fbe4

Browse files
committed
Clone toolshed repo and parse xml files for publications.
1 parent fb7a2b1 commit 045fbe4

File tree

5 files changed

+134
-19
lines changed

5 files changed

+134
-19
lines changed

TVQ.API/Controllers/RepositoriesController.cs

+13-10
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@ public RepositoriesController(
2121
_context = context;
2222
}
2323

24-
// GET: api/repositories
24+
// GET: api/v1/repositories
2525
[HttpGet]
2626
public IEnumerable<Repository> GetDatas()
2727
{
2828
return _context.Repositories;
2929
}
3030

31-
// GET: api/repositories/5
31+
// GET: api/v1/repositories/5
3232
[HttpGet("{id}")]
3333
public async Task<IActionResult> GetDataItem([FromRoute] int id)
3434
{
@@ -42,7 +42,7 @@ public async Task<IActionResult> GetDataItem([FromRoute] int id)
4242
return Ok(DataItem);
4343
}
4444

45-
// PUT: api/repositories/5
45+
// PUT: api/v1/repositories/5
4646
[HttpPut("{id}")]
4747
public async Task<IActionResult> PutDataItem([FromRoute] int id, [FromBody] Repository dataItem)
4848
{
@@ -69,7 +69,7 @@ public async Task<IActionResult> PutDataItem([FromRoute] int id, [FromBody] Repo
6969
return NoContent();
7070
}
7171

72-
// POST: api/repositories
72+
// POST: api/v1/repositories
7373
[HttpPost]
7474
public async Task<IActionResult> PostDataItem([FromBody] Repository dataItem)
7575
{
@@ -82,7 +82,7 @@ public async Task<IActionResult> PostDataItem([FromBody] Repository dataItem)
8282
return CreatedAtAction("GetRequestItems", new { }, dataItem);
8383
}
8484

85-
// DELETE: api/repositories/5
85+
// DELETE: api/v1/repositories/5
8686
[HttpDelete("{id}")]
8787
public async Task<IActionResult> DeleteDataItem([FromRoute] int id)
8888
{
@@ -99,26 +99,29 @@ public async Task<IActionResult> DeleteDataItem([FromRoute] int id)
9999
return Ok(dataItem);
100100
}
101101

102-
// GET: api/repositories/scan/1
102+
// GET: api/v1/repositories/1/scan
103103
[HttpGet("{id}/scan")]
104104
public async Task<IActionResult> ScanToolsInRepo([FromRoute] int id)
105105
{
106106
if (!ModelState.IsValid)
107107
return BadRequest(ModelState);
108108

109-
var dataItem = await _context.Repositories.FindAsync(id);
110-
if (dataItem == null)
109+
var repository = await _context.Repositories.FindAsync(id);
110+
if (repository == null)
111111
return NotFound();
112112

113113
/// TODO: `ConfigureAwait(false)` could be used on the following awaits
114114
/// so that their continuations do not resume on the captured context.
115115
/// However, since this is not a CPU-bound task, it may not
116116
/// be necessary. This should be investigated further.
117-
var tools = await new Crawler().CrawlAsync(dataItem);
117+
var crawler = new Crawler();
118+
var tools = await crawler.GetToolsAsync(repository);
119+
var publs = await crawler.GetPublicationsAsync(repository, tools);
118120

119121
try
120122
{
121123
await _context.Tools.AddRangeAsync(tools);
124+
await _context.Publications.AddRangeAsync(publs);
122125
await _context.SaveChangesAsync();
123126
}
124127
catch (DbUpdateConcurrencyException)
@@ -129,7 +132,7 @@ public async Task<IActionResult> ScanToolsInRepo([FromRoute] int id)
129132
throw;
130133
}
131134

132-
return Ok(dataItem);
135+
return Ok(repository);
133136
}
134137

135138
private bool DataItemExists(int id)

TVQ.API/Controllers/ToolsController.cs

+5-5
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ public ToolsController(
2020
_context = context;
2121
}
2222

23-
// GET: api/tools
23+
// GET: api/v1/tools
2424
[HttpGet]
2525
public IEnumerable<Tool> GetDatas()
2626
{
2727
return _context.Tools;
2828
}
2929

30-
// GET: api/tools/5
30+
// GET: api/v1/tools/5
3131
[HttpGet("{id}")]
3232
public async Task<IActionResult> GetDataItem([FromRoute] int id)
3333
{
@@ -46,7 +46,7 @@ public async Task<IActionResult> GetDataItem([FromRoute] int id)
4646
return Ok(DataItem);
4747
}
4848

49-
// PUT: api/tools/5
49+
// PUT: api/v1/tools/5
5050
[HttpPut("{id}")]
5151
public async Task<IActionResult> PutDataItem([FromRoute] int id, [FromBody] Tool DataItem)
5252
{
@@ -81,7 +81,7 @@ public async Task<IActionResult> PutDataItem([FromRoute] int id, [FromBody] Tool
8181
return NoContent();
8282
}
8383

84-
// POST: api/tools
84+
// POST: api/v1/tools
8585
[HttpPost]
8686
public async Task<IActionResult> PostDataItem([FromBody] Tool DataItem)
8787
{
@@ -96,7 +96,7 @@ public async Task<IActionResult> PostDataItem([FromBody] Tool DataItem)
9696
return CreatedAtAction("GetRequestItems", new { }, DataItem);
9797
}
9898

99-
// DELETE: api/tools/5
99+
// DELETE: api/v1/tools/5
100100
[HttpDelete("{id}")]
101101
public async Task<IActionResult> DeleteDataItem([FromRoute] int id)
102102
{

TVQ.API/Crawlers/Crawler.cs

+15-2
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,30 @@ public class Crawler
99
{
1010
public Crawler() { }
1111

12-
public async Task<List<Tool>> CrawlAsync(Repository repo)
12+
/// <summary>
/// Retrieves the tools of the given repository by delegating to the
/// crawler that matches the repository's name.
/// </summary>
/// <param name="repo">The repository to be scanned for tools.</param>
/// <returns>
/// The tools reported by the matching crawler, or an empty list when
/// no crawler is available for the repository.
/// </returns>
public async Task<List<Tool>> GetToolsAsync(Repository repo)
{
    // ToolShed is the only repository with a crawler at the moment.
    if (repo.Name == Repo.ToolShed)
    {
        var toolShed = new ToolShed();
        return await toolShed.GetTools(repo);
    }

    // TODO: replace with an exception.
    return new List<Tool>();
}
24+
25+
/// <summary>
/// Retrieves the publications of the given tools by delegating to the
/// crawler that matches the repository's name.
/// </summary>
/// <param name="repo">The repository the tools belong to.</param>
/// <param name="tools">The tools whose publications are requested.</param>
/// <returns>
/// The publications reported by the matching crawler, or an empty list
/// when no crawler is available for the repository.
/// </returns>
public async Task<List<Publication>> GetPublicationsAsync(Repository repo, List<Tool> tools)
{
    // ToolShed is the only repository with a crawler at the moment.
    if (repo.Name == Repo.ToolShed)
    {
        var toolShed = new ToolShed();
        return await toolShed.GetPublications(repo, tools);
    }

    // TODO: replace with an exception.
    return new List<Publication>();
}
2437
}
2538
}

TVQ.API/Crawlers/ToolShed.cs

+100-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
using Genometric.TVQ.API.Model;
22
using Newtonsoft.Json;
3+
using System;
34
using System.Collections.Generic;
5+
using System.IO;
46
using System.Net.Http;
57
using System.Threading.Tasks;
8+
using System.IO.Compression;
9+
using System.Xml.Linq;
610

711
namespace Genometric.TVQ.API.Crawlers
812
{
@@ -15,7 +19,7 @@ public ToolShed()
1519
_client = new HttpClient();
1620
}
1721

18-
public async Task<List<Tool>> Crawl(Repository repo)
22+
public async Task<List<Tool>> GetTools(Repository repo)
1923
{
2024
HttpResponseMessage response = await _client.GetAsync(repo.URI);
2125
string content;
@@ -32,5 +36,100 @@ public async Task<List<Tool>> Crawl(Repository repo)
3236

3337
return tools;
3438
}
39+
40+
/// <summary>
/// Downloads the tip archive of every given tool from the ToolShed,
/// and collects the citations declared in the XML files of each archive.
/// </summary>
/// <remarks>
/// This is a best-effort scan: a tool whose archive cannot be downloaded
/// or read is skipped, and an XML file that cannot be extracted or parsed
/// is skipped without affecting the other files of the same archive.
/// Every skipped item is reported via <see cref="System.Diagnostics.Trace"/>.
/// All the intermediate files are written under a uniquely named temporary
/// directory, which is deleted before the method returns or throws.
/// </remarks>
/// <param name="repo">The repository the tools belong to (currently not used).</param>
/// <param name="tools">The tools whose archives are scanned for citations.</param>
/// <returns>
/// The publications found for the given tools; an empty list if
/// <paramref name="tools"/> is null or empty.
/// </returns>
public async Task<List<Publication>> GetPublications(Repository repo, List<Tool> tools)
{
    var pubs = new List<Publication>();
    if (tools == null || tools.Count == 0)
        return pubs;

    // A GUID-based name cannot clash with the working directory of a
    // concurrent scan, hence there is no need to delete a pre-existing one.
    var tmpPath = Path.Combine(Path.GetTempPath(), "tvq_" + Guid.NewGuid().ToString("N"));
    Directory.CreateDirectory(tmpPath);

    try
    {
        foreach (var tool in tools)
        {
            // The working files are not named after the tool, so that
            // tools sharing an Id or a name cannot overwrite each other.
            var workId = Guid.NewGuid().ToString("N");
            var zipFileName = Path.Combine(tmpPath, workId + ".zip");
            var extractPath = Path.Combine(tmpPath, workId);

            // Without an archive there is nothing to read for this tool.
            if (!await DownloadArchiveAsync(tool, zipFileName))
                continue;

            pubs.AddRange(ReadCitationsFromArchive(zipFileName, extractPath, tool));
        }
    }
    finally
    {
        // Clean up the temporary files even if the scan is interrupted.
        Directory.Delete(tmpPath, true);
    }

    return pubs;
}

/// <summary>
/// Downloads the tip archive of the given tool to the given file.
/// </summary>
/// <returns>true if the archive is downloaded; otherwise, false.</returns>
private async Task<bool> DownloadArchiveAsync(Tool tool, string zipFileName)
{
    try
    {
        var address = new Uri(string.Format(
            "https://toolshed.g2.bx.psu.edu/repos/{0}/{1}/archive/tip.zip",
            tool.Owner,
            tool.Name));

        // TODO: creating a new client for every request may be way too
        // expensive; the HttpClient of this class could be reused instead.
        using (var webClient = new System.Net.WebClient())
            await webClient.DownloadFileTaskAsync(address, zipFileName);

        return true;
    }
    catch (Exception e)
    {
        System.Diagnostics.Trace.TraceWarning(
            "Skipping a tool; cannot download its archive to {0}: {1}",
            zipFileName, e);
        return false;
    }
}

/// <summary>
/// Extracts every XML file of the given archive to the given directory,
/// and collects the citations each of them declares.
/// </summary>
/// <returns>The publications found in the archive; possibly empty.</returns>
private List<Publication> ReadCitationsFromArchive(string zipFileName, string extractPath, Tool tool)
{
    var pubs = new List<Publication>();

    try
    {
        Directory.CreateDirectory(extractPath);

        using (ZipArchive archive = ZipFile.OpenRead(zipFileName))
            foreach (ZipArchiveEntry entry in archive.Entries)
            {
                if (!entry.FullName.EndsWith(".xml", StringComparison.OrdinalIgnoreCase))
                    continue;

                // Only the file name of an entry is used, so an entry cannot
                // be written outside of the extraction directory.
                var fileName = Path.GetFileName(entry.FullName);
                if (string.IsNullOrEmpty(fileName))
                    continue;

                try
                {
                    // Files with the same name may exist in different folders
                    // of an archive. Since each file is parsed right after it
                    // is extracted, overwriting a previous one is safe.
                    var extractedFileName = Path.Combine(extractPath, fileName);
                    entry.ExtractToFile(extractedFileName, true);
                    pubs.AddRange(ExtractCitation(extractedFileName, tool));
                }
                catch (Exception e)
                {
                    // A single unreadable or malformed XML file should not
                    // hide the citations of the other files in the archive.
                    System.Diagnostics.Trace.TraceWarning(
                        "Skipping {0} of archive {1}: {2}",
                        entry.FullName, zipFileName, e);
                }
            }
    }
    catch (Exception e)
    {
        // For instance, an InvalidDataException is thrown
        // when the Zip archive cannot be read.
        System.Diagnostics.Trace.TraceWarning(
            "Skipping archive {0}; cannot read it: {1}",
            zipFileName, e);
    }

    return pubs;
}
106+
107+
/// <summary>
/// Reads the given XML file and creates a publication for every element
/// under the root's <c>citations</c> elements whose <c>type</c> attribute
/// is <c>doi</c> or <c>bibtex</c>.
/// </summary>
/// <param name="filename">The path of the XML file to be read.</param>
/// <param name="tool">The tool the created publications are linked to.</param>
/// <returns>The publications found in the file; possibly empty.</returns>
/// <remarks>
/// Exceptions thrown while loading the file (e.g., for a malformed XML)
/// are not handled here.
/// </remarks>
private List<Publication> ExtractCitation(string filename, Tool tool)
{
    var pubs = new List<Publication>();
    XElement toolDoc = XElement.Load(filename);

    foreach (var item in toolDoc.Elements("citations").Descendants())
    {
        // Not every element under `citations` has a `type` attribute
        // (e.g., a macro expansion); such elements are not citations.
        var typeAttribute = item.Attribute("type");
        if (typeAttribute == null)
            continue;

        // An ordinal comparison keeps the match independent
        // of the culture of the current thread.
        var type = typeAttribute.Value.Trim();

        // NOTE(review): tool.Id is presumably assigned by the database;
        // confirm it is set for tools that are not persisted yet.
        if (string.Equals(type, "doi", StringComparison.OrdinalIgnoreCase))
        {
            pubs.Add(new Publication()
            {
                ToolId = tool.Id,
                // Whitespace around the element's text is not part of the DOI.
                DOI = item.Value.Trim()
            });
        }
        else if (string.Equals(type, "bibtex", StringComparison.OrdinalIgnoreCase))
        {
            pubs.Add(new Publication()
            {
                ToolId = tool.Id,
                Citation = item.Value
            });
        }
    }

    return pubs;
}
35134
}
36135
}

docker-compose.dcproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<DockerTargetOS>Linux</DockerTargetOS>
66
<ProjectGuid>44e9c443-0341-4a5d-a160-495121c51ffb</ProjectGuid>
77
<DockerLaunchAction>LaunchBrowser</DockerLaunchAction>
8-
<DockerServiceUrl>{Scheme}://localhost:{ServicePort}/api/repositories</DockerServiceUrl>
8+
<DockerServiceUrl>{Scheme}://localhost:{ServicePort}/api/v1/repositories</DockerServiceUrl>
99
<DockerServiceName>tvq</DockerServiceName>
1010
</PropertyGroup>
1111
<ItemGroup>

0 commit comments

Comments
 (0)