-
Notifications
You must be signed in to change notification settings - Fork 789
/
Copy pathDataUriParser.cs
182 lines (156 loc) · 6.37 KB
/
DataUriParser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System;
#if NET8_0_OR_GREATER
using System.Buffers.Text;
#endif
using System.Diagnostics;
using System.Net;
using System.Net.Http.Headers;
using System.Text;
namespace Microsoft.Extensions.AI;
/// <summary>
/// Minimal data URI parser based on RFC 2397: https://datatracker.ietf.org/doc/html/rfc2397.
/// </summary>
/// <remarks>
/// A data URI has the shape <c>data:[&lt;mediatype&gt;][;base64],&lt;data&gt;</c>. Parsing validates the
/// scheme, the media type, and (for base64 payloads) the payload alphabet, but does not decode the
/// payload; decoding is deferred to <see cref="DataUri.ToByteArray"/>.
/// </remarks>
internal static class DataUriParser
{
    /// <summary>Gets the scheme prefix, including the trailing colon, that every data URI must start with.</summary>
    public static string Scheme => "data:";

    /// <summary>Parses a data URI into its media type, encoding flag, and still-encoded payload.</summary>
    /// <param name="dataUri">The complete data URI, including the <c>data:</c> scheme.</param>
    /// <returns>A <see cref="DataUri"/> whose <see cref="DataUri.Data"/> is a slice of <paramref name="dataUri"/>.</returns>
    /// <exception cref="UriFormatException">
    /// The input does not start with the scheme, has no comma separator, declares base64 but carries an
    /// invalid base64 payload, or has an invalid media type.
    /// </exception>
    public static DataUri Parse(ReadOnlyMemory<char> dataUri)
    {
        const string Base64Marker = ";base64";

        // Validate, then trim off the "data:" scheme.
        if (!dataUri.Span.StartsWith(Scheme.AsSpan(), StringComparison.OrdinalIgnoreCase))
        {
            throw new UriFormatException("Invalid data URI format: the data URI must start with 'data:'.");
        }

        dataUri = dataUri.Slice(Scheme.Length);

        // Find the comma separating the metadata from the data.
        int commaPos = dataUri.Span.IndexOf(',');
        if (commaPos < 0)
        {
            throw new UriFormatException("Invalid data URI format: the data URI must contain a comma separating the metadata and the data.");
        }

        ReadOnlyMemory<char> metadata = dataUri.Slice(0, commaPos);
        ReadOnlyMemory<char> data = dataUri.Slice(commaPos + 1);
        bool isBase64 = false;

        // Determine whether the data is Base64-encoded or percent-encoded (Uri-encoded).
        // If it's base64-encoded, validate it. If it's Uri-encoded, there's nothing to validate,
        // as WebUtility's URL decoding accepts any input and passes malformed escapes through unchanged.
        if (metadata.Span.EndsWith(Base64Marker.AsSpan(), StringComparison.OrdinalIgnoreCase))
        {
            // The marker is not part of the media type, so strip it before validating the latter.
            metadata = metadata.Slice(0, metadata.Length - Base64Marker.Length);
            isBase64 = true;
            if (!IsValidBase64Data(data.Span))
            {
                throw new UriFormatException("Invalid data URI format: the data URI is base64-encoded, but the data is not a valid base64 string.");
            }
        }

        // Validate the media type, if present.
        string? mediaType = null;
        if (!IsValidMediaType(metadata.Span.Trim(), ref mediaType))
        {
            throw new UriFormatException("Invalid data URI format: the media type is not valid.");
        }

        return new DataUri(data, isBase64, mediaType);
    }

    /// <summary>Validates that a media type is valid, and if successful, ensures we have it as a string.</summary>
    /// <param name="mediaTypeSpan">The media type text to validate.</param>
    /// <param name="mediaType">
    /// On entry, either <see langword="null"/> or a string equal to <paramref name="mediaTypeSpan"/>.
    /// On exit, <see langword="null"/> if the span was empty or whitespace; otherwise a string for the span,
    /// reusing the caller's instance when one was supplied. Note that it is also assigned when validation fails.
    /// </param>
    /// <returns><see langword="true"/> if the media type is empty, well known, or parses successfully; otherwise <see langword="false"/>.</returns>
    public static bool IsValidMediaType(ReadOnlySpan<char> mediaTypeSpan, ref string? mediaType)
    {
        Debug.Assert(
            mediaType is null || mediaTypeSpan.Equals(mediaType.AsSpan(), StringComparison.Ordinal),
            "mediaType string should either be null or the same as the span");

        // If the media type is empty or all whitespace, normalize it to null.
        if (mediaTypeSpan.IsWhiteSpace())
        {
            mediaType = null;
            return true;
        }

        // For common media types, we can avoid both allocating a string for the span and avoid parsing overheads.
        // The match is ordinal and case-sensitive; anything else falls through to the full parser below.
        string? knownType = mediaTypeSpan switch
        {
            "application/json" => "application/json",
            "application/octet-stream" => "application/octet-stream",
            "application/pdf" => "application/pdf",
            "application/xml" => "application/xml",
            "audio/mpeg" => "audio/mpeg",
            "audio/ogg" => "audio/ogg",
            "audio/wav" => "audio/wav",
            "image/apng" => "image/apng",
            "image/avif" => "image/avif",
            "image/bmp" => "image/bmp",
            "image/gif" => "image/gif",
            "image/jpeg" => "image/jpeg",
            "image/png" => "image/png",
            "image/svg+xml" => "image/svg+xml",
            "image/tiff" => "image/tiff",
            "image/webp" => "image/webp",
            "text/css" => "text/css",
            "text/csv" => "text/csv",
            "text/html" => "text/html",
            "text/javascript" => "text/javascript",
            "text/plain" => "text/plain",
            "text/plain;charset=UTF-8" => "text/plain;charset=UTF-8",
            "text/xml" => "text/xml",
            _ => null,
        };

        if (knownType is not null)
        {
            mediaType ??= knownType;
            return true;
        }

        // Otherwise, do the full validation using the same logic as HttpClient.
        mediaType ??= mediaTypeSpan.ToString();
        return MediaTypeHeaderValue.TryParse(mediaType, out _);
    }

    /// <summary>Test whether the value is a base64 string without whitespace.</summary>
    /// <param name="value">The candidate base64 payload.</param>
    /// <returns><see langword="true"/> if <paramref name="value"/> is empty or valid, whitespace-free base64; otherwise <see langword="false"/>.</returns>
    private static bool IsValidBase64Data(ReadOnlySpan<char> value)
    {
        if (value.IsEmpty)
        {
            return true;
        }

#if NET8_0_OR_GREATER
        // Base64.IsValid tolerates whitespace, which is not acceptable inside a data URI, so reject it explicitly.
        return Base64.IsValid(value) && !value.ContainsAny(" \t\r\n");
#else
#pragma warning disable S109 // Magic numbers should not be used
        if (value.Length % 4 != 0)
#pragma warning restore S109
        {
            return false;
        }

        // The value is non-empty and a multiple of four long, so it has at least four characters
        // and stepping back twice below cannot run off the start.
        var index = value.Length - 1;

        // Step back over one or two padding chars
        if (value[index] == '=')
        {
            index--;
        }

        if (value[index] == '=')
        {
            index--;
        }

        // Now traverse over characters; any '=' left in this range is misplaced padding and is rejected.
        for (var i = 0; i <= index; i++)
        {
#pragma warning disable S1067 // Expressions should not be too complex
            bool validChar = value[i] is (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') or (>= '0' and <= '9') or '+' or '/';
#pragma warning restore S1067
            if (!validChar)
            {
                return false;
            }
        }

        return true;
#endif
    }

    /// <summary>Provides the parts of a parsed data URI.</summary>
    public sealed class DataUri(ReadOnlyMemory<char> data, bool isBase64, string? mediaType)
    {
#pragma warning disable S3604 // False positive: Member initializer values should not be redundant
        /// <summary>Gets the media type, or <see langword="null"/> if the URI did not specify one.</summary>
        public string? MediaType { get; } = mediaType;

        /// <summary>Gets the payload exactly as it appeared after the comma, still base64- or percent-encoded.</summary>
        public ReadOnlyMemory<char> Data { get; } = data;

        /// <summary>Gets a value indicating whether <see cref="Data"/> is base64-encoded rather than percent-encoded.</summary>
        public bool IsBase64 { get; } = isBase64;
#pragma warning restore S3604

        /// <summary>Decodes <see cref="Data"/> into the bytes it represents.</summary>
        /// <returns>The decoded payload.</returns>
        /// <exception cref="FormatException"><see cref="IsBase64"/> is set but <see cref="Data"/> is not valid base64.</exception>
        public byte[] ToByteArray()
        {
            string encoded = Data.ToString();
            if (IsBase64)
            {
                return Convert.FromBase64String(encoded);
            }

            // Decode percent-escapes at the byte level. Decoding to a string first and then re-encoding
            // as UTF-8 would replace any escaped octets that are not valid UTF-8 (e.g. "%FF") with U+FFFD,
            // corrupting binary payloads. Unescaped characters are still emitted as their UTF-8 bytes.
            // NOTE(review): WebUtility's URL decoding also maps '+' to a space (form-encoding semantics),
            // which RFC 2397 does not call for; this is preserved from the original behavior.
            return WebUtility.UrlDecodeToBytes(Encoding.UTF8.GetBytes(encoded)) ?? Array.Empty<byte>();
        }
    }
}