using Genometric.TVQ.API.Model;
using Newtonsoft.Json;
+using System;
using System.Collections.Generic;
+using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
+using System.IO.Compression;
+using System.Xml.Linq;

namespace Genometric.TVQ.API.Crawlers
{
@@ -15,7 +19,7 @@ public ToolShed()
            _client = new HttpClient();
        }

-       public async Task<List<Tool>> Crawl(Repository repo)
+       public async Task<List<Tool>> GetTools(Repository repo)
        {
            HttpResponseMessage response = await _client.GetAsync(repo.URI);
            string content;
@@ -32,5 +36,100 @@ public async Task<List<Tool>> Crawl(Repository repo)

            return tools;
        }
+
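+       /// <summary>
+       /// Downloads the archive of every given tool from the ToolShed and
+       /// extracts the publications (DOIs and BibTeX citations) declared
+       /// in the tools' XML wrapper files.
+       /// </summary>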
+       public async Task<List<Publication>> GetPublications(Repository repo, List<Tool> tools)
+       {
+           var pubs = new List<Publication>();
+           var rnd = new Random();
+           var tmpPath =
+               Path.GetFullPath(Path.GetTempPath()) +
+               rnd.Next(100000, 10000000) +
+               Path.DirectorySeparatorChar;
+           if (Directory.Exists(tmpPath))
+               Directory.Delete(tmpPath, true);
+           Directory.CreateDirectory(tmpPath);
+
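+           // For each tool: download its repository archive, extract the
+           // XML wrapper files it contains, and collect the citations they declare.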
+           foreach (var tool in tools)
+           {
+               string zipFileName = tmpPath + tool.Id;
+               try
+               {
+                   // TODO: creating a new client for every request may be
+                   // way too expensive. Maybe should check whether the
+                   // client can run multiple concurrent requests in a
+                   // thread-safe fashion?
+                   new System.Net.WebClient().DownloadFile(
+                       address: new Uri(string.Format(
+                           "https://toolshed.g2.bx.psu.edu/repos/{0}/{1}/archive/tip.zip",
+                           tool.Owner,
+                           tool.Name)),
+                       fileName: zipFileName);
+               }
+               catch (Exception)
+               {
+                   // A failed download is swallowed; opening the missing
+                   // archive below fails and is ignored as well.
+               }
+
+               // Normalizes the path.
+               // To avoid `path traversal attacks` from malicious software,
+               // there must be a trailing path separator at the end of the path.
+               string extractPath =
+                   tmpPath + tool.Id + "_" + rnd.Next(100000, 10000000) + "_" +
+                   Path.DirectorySeparatorChar;
+               Directory.CreateDirectory(extractPath);
+
+               try
+               {
+                   using (ZipArchive archive = ZipFile.OpenRead(zipFileName))
+                       foreach (ZipArchiveEntry entry in archive.Entries)
+                           if (entry.FullName.EndsWith(".xml", StringComparison.OrdinalIgnoreCase))
+                           {
+                               var extractedFileName = extractPath + Path.GetFileName(entry.FullName);
+                               entry.ExtractToFile(extractedFileName);
+                               pubs.AddRange(ExtractCitation(extractedFileName, tool));
+                           }
+               }
+               catch (InvalidDataException)
+               {
+                   // This exception is thrown when the zip archive
+                   // cannot be read.
+               }
+               catch (Exception)
+               {
+                   // Any other failure for this tool is ignored so that
+                   // the remaining tools can still be processed.
+               }
+           }
+
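+           // Remove the staging directory and everything downloaded into it.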
+           Directory.Delete(tmpPath, true);
+           return pubs;
+       }
+
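+       /// <summary>
+       /// Reads a tool's XML wrapper and returns one publication per
+       /// citation element, keeping DOI and BibTeX citations.
+       /// </summary>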
+       private List<Publication> ExtractCitation(string filename, Tool tool)
+       {
+           var pubs = new List<Publication>();
+           XElement toolDoc = XElement.Load(filename);
+
+           // Citation elements without a `type` attribute are skipped.
+           foreach (var item in toolDoc.Elements("citations").Descendants())
+               switch (item.Attribute("type")?.Value.Trim().ToLower())
+               {
+                   case "doi":
+                       pubs.Add(new Publication()
+                       {
+                           ToolId = tool.Id,
+                           DOI = item.Value
+                       });
+                       break;
+
+                   case "bibtex":
+                       pubs.Add(new Publication()
+                       {
+                           ToolId = tool.Id,
+                           Citation = item.Value
+                       });
+                       break;
+               }
+
+           return pubs;
+       }
    }
}