Skip to content

Commit

Permalink
Fixed a bug which caused XBRL document instances hosted on www.sec.gov
Browse files Browse the repository at this point in the history
…to report schema load errors and report that no facts are available in the instance. This fixes issue #56, "No Facts Found In Loaded Document".
  • Loading branch information
JeffFerguson committed Dec 31, 2024
1 parent c381813 commit 5220894
Show file tree
Hide file tree
Showing 10 changed files with 267 additions and 137 deletions.
12 changes: 12 additions & 0 deletions JeffFerguson.Gepsio.Test/IssueTests/SingleMethodIssueTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -227,5 +227,17 @@ public void VerifyFixForIssue57()
var elementWithoutDefinedId = firstFragment.Schemas.GetElement("explicitMember");
var hashCode = elementWithoutDefinedId.GetHashCode();
}

/// <summary>
/// Regression test for issue 56: loads an XBRL instance hosted on www.sec.gov
/// and checks that the first fragment of the loaded document exposes at least one fact.
/// </summary>
[TestMethod]
public void VerifyFixForIssue56()
{
    const string secHostedInstance = "https://www.sec.gov/Archives/edgar/data/1688568/000168856818000036/csc-20170331.xml";

    var loadedDocument = new XbrlDocument();
    loadedDocument.Load(secHostedInstance);

    var factsInFirstFragment = loadedDocument.XbrlFragments[0].Facts;
    Assert.IsTrue(factsInFirstFragment.Count > 0);
}
}
}
10 changes: 9 additions & 1 deletion JeffFerguson.Gepsio/LinkbaseDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,15 @@ internal LinkbaseDocument(string ContainingDocumentUri, string DocumentPath, Xbr
{
thisLinkbasePath = GetFullLinkbasePath(ContainingDocumentUri, DocumentPath);
thisXmlDocument = Container.Resolve<IDocument>();
thisXmlDocument.Load(thisLinkbasePath);
if(SecContent.IsSecUri(thisLinkbasePath) == true)
{
var documentStream = SecContent.GetStream(thisLinkbasePath);
thisXmlDocument.Load(documentStream);
}
else
{
thisXmlDocument.Load(thisLinkbasePath);
}
thisNamespaceManager = Container.Resolve<INamespaceManager>();
thisNamespaceManager.Document = thisXmlDocument;
thisNamespaceManager.AddNamespace("default", XbrlDocument.XbrlLinkbaseNamespaceUri);
Expand Down
101 changes: 101 additions & 0 deletions JeffFerguson.Gepsio/SecContent.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
using System;
using System.IO;
using System.Net.Http;
using System.Net;
using System.Threading.Tasks;

namespace JeffFerguson.Gepsio
{
/// <summary>
/// Static methods supporting the capture of content from the SEC Web site.
/// </summary>
/// <remarks>
/// The SEC Web site does not allow code to scrape documents from the site
/// without supplying appropriate HTTP headers. Without the correct HTTP
/// headers, simply calling XDocument.Load() for a document stored at the
/// SEC Web site will fail, most likely with an HTTP 403 error code. Since
/// Gepsio contains unit tests that reference documents stored at the SEC Web
/// site, support for SEC-compatible HTTP headers is necessary. See documentation
/// at https://www.sec.gov/os/accessing-edgar-data for more information.
/// </remarks>
static internal class SecContent
{
    private const string UserAgentValue = "Gepsio gepsioxbrl@outlook.com";
    private const string AcceptEncodingValue = "gzip, deflate";
    private const string HostValue = "www.sec.gov";

    // A single client is shared by every request. Creating (and disposing) a new
    // HttpClient per call, as was done previously, churns through sockets when many
    // schemas and linkbases are fetched while loading one document.
    private static readonly HttpClient SharedHttpClient = new HttpClient(
        new HttpClientHandler() { AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate });

    /// <summary>
    /// Determines whether or not a URI references the SEC Web site.
    /// </summary>
    /// <param name="uriPath">
    /// The URI to check. May be null, empty, or a relative path; all of those
    /// are reported as not being SEC URIs.
    /// </param>
    /// <returns>
    /// True if the supplied URI is an absolute URI whose host is www.sec.gov; false otherwise.
    /// </returns>
    static internal bool IsSecUri(string uriPath)
    {
        // TryCreate returns false for null, empty, and relative input, so no
        // exception handling is needed (the old code let a null argument escape
        // as an ArgumentNullException).
        if (Uri.TryCreate(uriPath, UriKind.Absolute, out var uriToInspect) == false)
        {
            return false;
        }
        return string.Equals(uriToInspect.Host, HostValue, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Synchronously gets a stream of data sourced from content hosted by the SEC.
    /// </summary>
    /// <param name="uriPath">
    /// The URI to the hosted content.
    /// </param>
    /// <returns>
    /// A stream of data from the hosted content. The caller owns the returned stream.
    /// </returns>
    /// <exception cref="HttpRequestException">
    /// Thrown when the request fails or the response status code does not indicate success.
    /// </exception>
    static internal Stream GetStream(string uriPath)
    {
        using (var request = CreateRequest(uriPath))
        {
            var response = SharedHttpClient.Send(request);
            try
            {
                response.EnsureSuccessStatusCode();
                // The response is intentionally not disposed on success: disposing it
                // would also dispose its content, and with it the stream handed back
                // to the caller.
                return response.Content.ReadAsStream();
            }
            catch
            {
                response.Dispose();
                throw;
            }
        }
    }

    /// <summary>
    /// Asynchronously gets a stream of data sourced from content hosted by the SEC.
    /// </summary>
    /// <param name="uriPath">
    /// The URI to the hosted content.
    /// </param>
    /// <returns>
    /// A stream of data from the hosted content. The caller owns the returned stream.
    /// </returns>
    /// <exception cref="HttpRequestException">
    /// Thrown when the request fails or the response status code does not indicate success.
    /// </exception>
    static internal async Task<Stream> GetStreamAsync(string uriPath)
    {
        using (var request = CreateRequest(uriPath))
        {
            var response = await SharedHttpClient.SendAsync(request).ConfigureAwait(false);
            try
            {
                response.EnsureSuccessStatusCode();
                // See GetStream: the response must outlive this method so that the
                // returned stream stays readable.
                return await response.Content.ReadAsStreamAsync().ConfigureAwait(false);
            }
            catch
            {
                response.Dispose();
                throw;
            }
        }
    }

    /// <summary>
    /// Builds a GET request for the supplied URI carrying the HTTP headers sent with every SEC request.
    /// </summary>
    /// <param name="uriPath">
    /// The URI to the hosted content.
    /// </param>
    /// <returns>
    /// A request message that the caller must dispose.
    /// </returns>
    private static HttpRequestMessage CreateRequest(string uriPath)
    {
        var request = new HttpRequestMessage(HttpMethod.Get, uriPath);
        // The User-Agent value contains an e-mail address, which does not pass the
        // strict header validation, hence TryAddWithoutValidation.
        request.Headers.TryAddWithoutValidation("User-Agent", UserAgentValue);
        request.Headers.Add("Accept-Encoding", AcceptEncodingValue);
        request.Headers.Add("Host", HostValue);
        return request;
    }
}
}
145 changes: 42 additions & 103 deletions JeffFerguson.Gepsio/XbrlDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,10 @@ public XbrlDocument()
/// </param>
public void Load(string Filename)
{
if (IsSecUri(Filename) == true)
if (SecContent.IsSecUri(Filename) == true)
{
LoadFromSec(Filename);
var responseAsStream = SecContent.GetStream(Filename);
Load(responseAsStream, Filename);
}
else
{
Expand All @@ -190,71 +191,6 @@ public void Load(string Filename)
Parse(SchemaValidXbrl);
}
}

/// <summary>
/// Synchronously load a document directly from the SEC Web site.
/// </summary>
/// <remarks>
/// The SEC Web site does not allow code to scrape documents from the site
/// without supplying appropriate HTTP headers. Without the correct HTTP
/// headers, simply calling XDocument.Load() for a document stored at the
/// SEC Web site will fail, most likely with an HTTP 403 error code. Since
/// Gepsio contains unit tests that reference documents stored at the SEC Web
/// site, support for SEC-compatible HTTP headers is necessary. See documentation
/// at https://www.sec.gov/os/accessing-edgar-data for more information.
/// </remarks>
/// <param name="path">
/// The path of the document to load from the SEC Web site.
/// </param>
/// <exception cref="HttpRequestException">
/// Thrown when the request fails or the response status code does not indicate success.
/// </exception>
private void LoadFromSec(string path)
{
    using (var request = new HttpRequestMessage(HttpMethod.Get, path))
    {
        // The handler is handed to the HttpClient constructor below; it is not
        // disposed separately here.
        var httpClientHandler = new HttpClientHandler() { AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate };
        using (var httpClient = new HttpClient(httpClientHandler))
        {
            request.Headers.TryAddWithoutValidation("User-Agent", "Gepsio gepsioxbrl@outlook.com");
            request.Headers.Add("Accept-Encoding", "gzip, deflate");
            request.Headers.Add("Host", "www.sec.gov");
            // The response and its stream are fully consumed by Load() before this
            // method returns, so both can be released deterministically here
            // instead of being left for the garbage collector.
            using (var response = httpClient.Send(request))
            {
                response.EnsureSuccessStatusCode();
                using (var responseAsStream = response.Content.ReadAsStream())
                {
                    Load(responseAsStream);
                }
            }
        }
    }
}
/// <summary>
/// Asynchronously load a document directly from the SEC Web site.
/// </summary>
/// <remarks>
/// The SEC Web site does not allow code to scrape documents from the site
/// without supplying appropriate HTTP headers. Without the correct HTTP
/// headers, simply calling XDocument.Load() for a document stored at the
/// SEC Web site will fail, most likely with an HTTP 403 error code. Since
/// Gepsio contains unit tests that reference documents stored at the SEC Web
/// site, support for SEC-compatible HTTP headers is necessary. See documentation
/// at https://www.sec.gov/os/accessing-edgar-data for more information.
/// </remarks>
/// <param name="path">
/// The path of the document to load from the SEC Web site.
/// </param>
/// <returns>
/// A task that completes once the document has been downloaded and loaded.
/// </returns>
/// <exception cref="HttpRequestException">
/// Thrown when the request fails or the response status code does not indicate success.
/// </exception>
private async Task LoadFromSecAsync(string path)
{
    using (var request = new HttpRequestMessage(HttpMethod.Get, path))
    {
        // The handler is handed to the HttpClient constructor below; it is not
        // disposed separately here.
        var httpClientHandler = new HttpClientHandler() { AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate };
        using (var httpClient = new HttpClient(httpClientHandler))
        {
            request.Headers.TryAddWithoutValidation("User-Agent", "Gepsio gepsioxbrl@outlook.com");
            request.Headers.Add("Accept-Encoding", "gzip, deflate");
            request.Headers.Add("Host", "www.sec.gov");
            // The response and its stream are fully consumed by LoadAsync() before
            // this method completes, so both can be released deterministically here
            // instead of being left for the garbage collector.
            using (var response = await httpClient.SendAsync(request))
            {
                response.EnsureSuccessStatusCode();
                using (var responseAsStream = await response.Content.ReadAsStreamAsync())
                {
                    await LoadAsync(responseAsStream);
                }
            }
        }
    }
}

/// <summary>
/// Asynchronously loads a local filesystem or Internet-accessible XBRL document containing
Expand All @@ -275,14 +211,10 @@ private async Task LoadFromSecAsync(string path)
/// </param>
public async Task LoadAsync(string Filename)
{
//var SchemaValidXbrl = Container.Resolve<IDocument>();
//await SchemaValidXbrl.LoadAsync(Filename);
//this.Filename = Filename;
//this.Path = System.IO.Path.GetDirectoryName(this.Filename);
//Parse(SchemaValidXbrl);
if (IsSecUri(Filename) == true)
if (SecContent.IsSecUri(Filename) == true)
{
await LoadFromSecAsync(Filename);
var responseAsStream = await SecContent.GetStreamAsync(Filename);
await LoadAsync(responseAsStream, Filename);
}
else
{
Expand Down Expand Up @@ -312,8 +244,9 @@ public async Task LoadAsync(string Filename)
/// newDoc.Load(memStream);
/// </code>
/// <para>
/// Schema references found in streamed XBRL instances must specify an absolute location, and not
/// a relative location. For example, this schema reference is fine:
/// Unless the dataStreamSourceLocation parameter is used, and is not empty, schema references found
/// in streamed XBRL instances must specify an absolute location, and not a relative location. For
/// example, this schema reference is fine:
/// </para>
/// <code>
/// xsi:schemaLocation=http://www.xbrlsolutions.com/taxonomies/iso4217/2002-06-30/iso4217.xsd
Expand Down Expand Up @@ -364,16 +297,32 @@ public async Task LoadAsync(string Filename)
/// XBRL document instances loaded through a stream which use absolute paths for schema references will be
/// valid (assuming that all of the other XBRL semantics in the instance are correct).
/// </para>
/// <para>
/// If the dataStreamSourceLocation parameter is used, and is not NULL or empty, then the source location
/// will be used as the source from which to locate relative schema references.
/// </para>
/// </remarks>
/// <param name="dataStream">
/// A stream of data containing the XML document to load.
/// </param>
public void Load(Stream dataStream)
/// <param name="dataStreamSourceLocation">
/// The location of the source document from which the data stream was loaded. The location can be
/// empty or NULL if the location is not known.
/// </param>
public void Load(Stream dataStream, string dataStreamSourceLocation = null)
{
var SchemaValidXbrl = Container.Resolve<IDocument>();
SchemaValidXbrl.Load(dataStream);
this.Filename = string.Empty;
this.Path = string.Empty;
if (string.IsNullOrEmpty(dataStreamSourceLocation) == true)
{
this.Filename = string.Empty;
this.Path = string.Empty;
}
else
{
this.Filename = dataStreamSourceLocation;
this.Path = System.IO.Path.GetDirectoryName(this.Filename);
}
Parse(SchemaValidXbrl);
}

Expand Down Expand Up @@ -451,35 +400,25 @@ public void Load(Stream dataStream)
/// <param name="dataStream">
/// A stream of data containing the XML document to load.
/// </param>
public async Task LoadAsync(Stream dataStream)
/// <param name="dataStreamSourceLocation">
/// The location of the source document from which the data stream was loaded. The location can be
/// empty or NULL if the location is not known.
/// </param>
public async Task LoadAsync(Stream dataStream, string dataStreamSourceLocation = null)
{
var SchemaValidXbrl = Container.Resolve<IDocument>();
await SchemaValidXbrl.LoadAsync(dataStream);
this.Filename = string.Empty;
this.Path = string.Empty;
Parse(SchemaValidXbrl);
}

/// <summary>
/// Determines whether or not a URI references the SEC Web site.
/// </summary>
/// <param name="uriPath">
/// The URI to check.
/// </param>
/// <returns>
/// True if the supplied URI is an SEC URI; false otherwise.
/// </returns>
private bool IsSecUri(string uriPath)
{
try
if (string.IsNullOrEmpty(dataStreamSourceLocation) == true)
{
var uriToInspect = new Uri(uriPath);
return uriToInspect.Host.ToLower().Trim().Equals("www.sec.gov");
}
catch(UriFormatException)
this.Filename = string.Empty;
this.Path = string.Empty;
}
else
{
return false;
}
this.Filename = dataStreamSourceLocation;
this.Path = System.IO.Path.GetDirectoryName(this.Filename);
}
Parse(SchemaValidXbrl);
}

/// <summary>
Expand Down
30 changes: 25 additions & 5 deletions JeffFerguson.Gepsio/XbrlSchema.cs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,15 @@ internal XbrlSchema(XbrlFragment ContainingXbrlFragment, string SchemaFilename,
}
thisSchemaDocument = Container.Resolve<IDocument>();
this.RoleTypes = new List<RoleType>();
thisSchemaDocument.Load(this.LoadPath);
if(SecContent.IsSecUri(this.LoadPath) == true)
{
var sourceStream = SecContent.GetStream(this.LoadPath);
thisSchemaDocument.Load(sourceStream);
}
else
{
thisSchemaDocument.Load(this.LoadPath);
}
this.NamespaceManager = Container.Resolve<INamespaceManager>();
this.NamespaceManager.Document = thisSchemaDocument;
this.NamespaceManager.AddNamespace("schema", XbrlSchema.XmlSchemaNamespaceUri);
Expand All @@ -209,7 +217,17 @@ private bool ReadAndCompile(string schemaPath)
{
thisXmlSchema = Container.Resolve<ISchema>();
thisXmlSchemaSet = Container.Resolve<ISchemaSet>();
if (thisXmlSchema.Read(schemaPath) == false)
var readSuccessful = true;
if(SecContent.IsSecUri(schemaPath) == true)
{
var schemaStream = SecContent.GetStream(schemaPath);
readSuccessful = thisXmlSchema.Read(schemaStream, schemaPath);
}
else
{
readSuccessful = thisXmlSchema.Read(schemaPath);
}
if (readSuccessful == false)
{
StringBuilder MessageBuilder = new StringBuilder();
string StringFormat = AssemblyResources.GetName("SchemaFileCandidateDoesNotContainSchemaRootNode");
Expand All @@ -219,9 +237,11 @@ private bool ReadAndCompile(string schemaPath)
}
thisXmlSchemaSet.Add(thisXmlSchema);
thisXmlSchemaSet.Compile();
foreach( var schema in thisXmlSchemaSet.Schemas ) {
foreach( var item in schema.AppInfo ) {
thisLinkbaseDocuments.ReadLinkbaseReferences(schema.SourceUri, item.Markup, null);
foreach(var schema in thisXmlSchemaSet.Schemas)
{
foreach(var item in schema.AppInfo)
{
thisLinkbaseDocuments.ReadLinkbaseReferences(schema.SourceUri, item.Markup, null);
}
}
return true;
Expand Down
Loading

0 comments on commit 5220894

Please sign in to comment.