Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "nsFeedSniffer.h"
7 :
8 : #include "mozilla/Unused.h"
9 :
10 : #include "nsNetCID.h"
11 : #include "nsXPCOM.h"
12 : #include "nsCOMPtr.h"
13 : #include "nsStringStream.h"
14 :
15 : #include "nsBrowserCompsCID.h"
16 :
17 : #include "nsICategoryManager.h"
18 : #include "nsIServiceManager.h"
19 : #include "nsComponentManagerUtils.h"
20 : #include "nsServiceManagerUtils.h"
21 :
22 : #include "nsIStreamConverterService.h"
23 : #include "nsIStreamConverter.h"
24 :
25 : #include "nsIStreamListener.h"
26 :
27 : #include "nsIHttpChannel.h"
28 : #include "nsIMIMEHeaderParam.h"
29 :
30 : #include "nsMimeTypes.h"
31 : #include "nsIURI.h"
32 : #include <algorithm>
33 :
// Content types that GetMIMETypeFromContent accepts as an explicit feed
// declaration without looking at the body.
34 : #define TYPE_ATOM "application/atom+xml"
35 : #define TYPE_RSS "application/rss+xml"
// Type written to sniffedType when the response is believed to be a feed.
36 : #define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"
37 :
// Namespace URIs that must both occur in the sniffed data before an <rdf:RDF
// root element is accepted as RSS 1.0.
38 : #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
39 : #define NS_RSS "http://purl.org/rss/1.0/"
40 :
// Upper bound on the number of bytes GetMIMETypeFromContent scans.
41 : #define MAX_BYTES 512u
42 :
43 19 : NS_IMPL_ISUPPORTS(nsFeedSniffer,
44 : nsIContentSniffer,
45 : nsIStreamListener,
46 : nsIRequestObserver)
47 :
48 : nsresult
49 1 : nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
50 : const uint8_t* data,
51 : uint32_t length)
52 : {
53 1 : nsresult rv = NS_OK;
54 :
55 1 : mDecodedData = "";
56 2 : nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
57 1 : if (!httpChannel)
58 0 : return NS_ERROR_NO_INTERFACE;
59 :
60 2 : nsAutoCString contentEncoding;
61 3 : mozilla::Unused << httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
62 2 : contentEncoding);
63 1 : if (!contentEncoding.IsEmpty()) {
64 0 : nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
65 0 : if (converterService) {
66 0 : ToLowerCase(contentEncoding);
67 :
68 0 : nsCOMPtr<nsIStreamListener> converter;
69 0 : rv = converterService->AsyncConvertData(contentEncoding.get(),
70 : "uncompressed", this, nullptr,
71 0 : getter_AddRefs(converter));
72 0 : NS_ENSURE_SUCCESS(rv, rv);
73 :
74 0 : converter->OnStartRequest(request, nullptr);
75 :
76 : nsCOMPtr<nsIStringInputStream> rawStream =
77 0 : do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
78 0 : if (!rawStream)
79 0 : return NS_ERROR_FAILURE;
80 :
81 0 : rv = rawStream->SetData((const char*)data, length);
82 0 : NS_ENSURE_SUCCESS(rv, rv);
83 :
84 0 : rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length);
85 0 : NS_ENSURE_SUCCESS(rv, rv);
86 :
87 0 : converter->OnStopRequest(request, nullptr, NS_OK);
88 : }
89 : }
90 1 : return rv;
91 : }
92 :
93 : template<int N>
94 : static bool
95 : StringBeginsWithLowercaseLiteral(nsAString& aString,
96 : const char (&aSubstring)[N])
97 : {
98 : return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
99 : }
100 :
101 : bool
102 0 : HasAttachmentDisposition(nsIHttpChannel* httpChannel)
103 : {
104 0 : if (!httpChannel)
105 0 : return false;
106 :
107 : uint32_t disp;
108 0 : nsresult rv = httpChannel->GetContentDisposition(&disp);
109 :
110 0 : if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
111 0 : return true;
112 :
113 0 : return false;
114 : }
115 :
/**
 * Locates a character inside the half-open buffer [begin, end).
 *
 * @param c
 *        The character to look for.
 * @param begin
 *        First character of the buffer.
 * @param end
 *        One past the last character of the buffer.
 * @return a pointer to the first occurrence of |c|, or nullptr if the buffer
 *         is empty or does not contain it.
 */
static const char*
FindChar(char c, const char *begin, const char *end)
{
  // An empty (or inverted) range never contains anything.
  if (begin >= end) {
    return nullptr;
  }

  const char* match = std::find(begin, end, c);
  return match != end ? match : nullptr;
}
129 :
130 : /**
131 : *
132 : * Determine if a substring is the "documentElement" in the document.
133 : *
134 : * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
135 : * element within the XML DOM, i.e. the root container element. Otherwise,
136 : * it's possible that someone embedded one of these tags inside a document of
137 : * another type, e.g. a HTML document, and we don't want to show the preview
138 : * page if the document isn't actually a feed.
139 : *
140 : * @param start
141 : * The beginning of the data being sniffed
142 : * @param end
143 : * The end of the data being sniffed, right before the substring that
144 : * was found.
145 : * @returns true if the found substring is the documentElement, false
146 : * otherwise.
147 : */
148 : static bool
149 0 : IsDocumentElement(const char *start, const char* end)
150 : {
151 : // For every tag in the buffer, check to see if it's a PI, Doctype or
152 : // comment, our desired substring or something invalid.
153 0 : while ( (start = FindChar('<', start, end)) ) {
154 0 : ++start;
155 0 : if (start >= end)
156 0 : return false;
157 :
158 : // Check to see if the character following the '<' is either '?' or '!'
159 : // (processing instruction or doctype or comment)... these are valid nodes
160 : // to have in the prologue.
161 0 : if (*start != '?' && *start != '!')
162 0 : return false;
163 :
164 : // Now advance the iterator until the '>' (We do this because we don't want
165 : // to sniff indicator substrings that are embedded within other nodes, e.g.
166 : // comments: <!-- <rdf:RDF .. > -->
167 0 : start = FindChar('>', start, end);
168 0 : if (!start)
169 0 : return false;
170 :
171 0 : ++start;
172 : }
173 0 : return true;
174 : }
175 :
176 : /**
177 : * Determines whether or not a string exists as the root element in an XML data
178 : * string buffer.
179 : * @param dataString
180 : * The data being sniffed
181 : * @param substring
182 : * The substring being tested for existence and root-ness.
183 : * @returns true if the substring exists and is the documentElement, false
184 : * otherwise.
185 : */
186 : static bool
187 3 : ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
188 : {
189 3 : nsACString::const_iterator start, end;
190 3 : dataString.BeginReading(start);
191 3 : dataString.EndReading(end);
192 :
193 3 : if (!FindInReadable(nsCString(substring), start, end)){
194 3 : return false;
195 : }
196 :
197 0 : auto offset = start.get() - dataString.Data();
198 :
199 0 : const char *begin = dataString.BeginReading();
200 :
201 : // Only do the validation when we find the substring.
202 0 : return IsDocumentElement(begin, begin + offset);
203 : }
204 :
205 : NS_IMETHODIMP
206 3 : nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
207 : const uint8_t* data,
208 : uint32_t length,
209 : nsACString& sniffedType)
210 : {
211 6 : nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
212 3 : if (!channel)
213 2 : return NS_ERROR_NO_INTERFACE;
214 :
215 : // Check that this is a GET request, since you can't subscribe to a POST...
216 2 : nsAutoCString method;
217 1 : mozilla::Unused << channel->GetRequestMethod(method);
218 1 : if (!method.EqualsLiteral("GET")) {
219 0 : sniffedType.Truncate();
220 0 : return NS_OK;
221 : }
222 :
223 : // We need to find out if this is a load of a view-source document. In this
224 : // case we do not want to override the content type, since the source display
225 : // does not need to be converted from feed format to XUL. More importantly,
226 : // we don't want to change the content type from something
227 : // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
228 : // etc) to something that only the application fe knows about (maybe.feed)
229 : // thus deactivating syntax highlighting.
230 2 : nsCOMPtr<nsIURI> originalURI;
231 1 : channel->GetOriginalURI(getter_AddRefs(originalURI));
232 :
233 2 : nsAutoCString scheme;
234 1 : originalURI->GetScheme(scheme);
235 1 : if (scheme.EqualsLiteral("view-source")) {
236 0 : sniffedType.Truncate();
237 0 : return NS_OK;
238 : }
239 :
240 : // Check the Content-Type to see if it is set correctly. If it is set to
241 : // something specific that we think is a reliable indication of a feed, don't
242 : // bother sniffing since we assume the site maintainer knows what they're
243 : // doing.
244 2 : nsAutoCString contentType;
245 1 : channel->GetContentType(contentType);
246 2 : bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
247 2 : contentType.EqualsLiteral(TYPE_ATOM);
248 :
249 : // Check to see if this was a feed request from the location bar or from
250 : // the feed: protocol. This is also a reliable indication.
251 : // The value of the header doesn't matter.
252 1 : if (!noSniff) {
253 2 : nsAutoCString sniffHeader;
254 : nsresult foundHeader =
255 4 : channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
256 3 : sniffHeader);
257 1 : noSniff = NS_SUCCEEDED(foundHeader);
258 : }
259 :
260 1 : if (noSniff) {
261 : // check for an attachment after we have a likely feed.
262 0 : if(HasAttachmentDisposition(channel)) {
263 0 : sniffedType.Truncate();
264 0 : return NS_OK;
265 : }
266 :
267 : // set the feed header as a response header, since we have good metadata
268 : // telling us that the feed is supposed to be RSS or Atom
269 : mozilla::DebugOnly<nsresult> rv =
270 0 : channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
271 0 : NS_LITERAL_CSTRING("1"), false);
272 0 : MOZ_ASSERT(NS_SUCCEEDED(rv));
273 0 : sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
274 0 : return NS_OK;
275 : }
276 :
277 : // Don't sniff arbitrary types. Limit sniffing to situations that
278 : // we think can reasonably arise.
279 2 : if (!contentType.EqualsLiteral(TEXT_HTML) &&
280 1 : !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
281 : // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
282 : // and check for text/xml and application/xml by hand instead?
283 0 : contentType.Find("xml") == -1) {
284 0 : sniffedType.Truncate();
285 0 : return NS_OK;
286 : }
287 :
288 : // Now we need to potentially decompress data served with
289 : // Content-Encoding: gzip
290 1 : nsresult rv = ConvertEncodedData(request, data, length);
291 1 : if (NS_FAILED(rv))
292 0 : return rv;
293 :
294 : // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
295 : // false positives by accidentally reading document content, e.g. a "how to
296 : // make a feed" page.
297 : const char* testData;
298 1 : if (mDecodedData.IsEmpty()) {
299 1 : testData = (const char*)data;
300 1 : length = std::min(length, MAX_BYTES);
301 : } else {
302 0 : testData = mDecodedData.get();
303 0 : length = std::min(mDecodedData.Length(), MAX_BYTES);
304 : }
305 :
306 : // The strategy here is based on that described in:
307 : // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
308 : // for interoperarbility purposes.
309 :
310 : // Thus begins the actual sniffing.
311 2 : nsDependentCSubstring dataString((const char*)testData, length);
312 :
313 1 : bool isFeed = false;
314 :
315 : // RSS 0.91/0.92/2.0
316 1 : isFeed = ContainsTopLevelSubstring(dataString, "<rss");
317 :
318 : // Atom 1.0
319 1 : if (!isFeed)
320 1 : isFeed = ContainsTopLevelSubstring(dataString, "<feed");
321 :
322 : // RSS 1.0
323 1 : if (!isFeed) {
324 1 : bool foundNS_RDF = FindInReadable(NS_LITERAL_CSTRING(NS_RDF), dataString);
325 1 : bool foundNS_RSS = FindInReadable(NS_LITERAL_CSTRING(NS_RSS), dataString);
326 2 : isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
327 1 : foundNS_RDF && foundNS_RSS;
328 : }
329 :
330 : // If we sniffed a feed, coerce our internal type
331 1 : if (isFeed && !HasAttachmentDisposition(channel))
332 0 : sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
333 : else
334 1 : sniffedType.Truncate();
335 1 : return NS_OK;
336 : }
337 :
338 : NS_IMETHODIMP
339 0 : nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
340 : {
341 0 : return NS_OK;
342 : }
343 :
344 : nsresult
345 0 : nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
346 : void* closure,
347 : const char* rawSegment,
348 : uint32_t toOffset,
349 : uint32_t count,
350 : uint32_t* writeCount)
351 : {
352 0 : nsCString* decodedData = static_cast<nsCString*>(closure);
353 0 : decodedData->Append(rawSegment, count);
354 0 : *writeCount = count;
355 0 : return NS_OK;
356 : }
357 :
358 : NS_IMETHODIMP
359 0 : nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
360 : nsIInputStream* stream, uint64_t offset,
361 : uint32_t count)
362 : {
363 : uint32_t read;
364 0 : return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
365 0 : &read);
366 : }
367 :
368 : NS_IMETHODIMP
369 0 : nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
370 : nsresult status)
371 : {
372 0 : return NS_OK;
373 : }
|