mirror of
https://github.com/cookiengineer/audacity
synced 2025-05-06 14:52:34 +02:00
1997 lines
62 KiB
C
1997 lines
62 KiB
C
/* -*- Mode: c; c-basic-offset: 2 -*-
|
|
*
|
|
* raptor_grddl.c - Raptor GRDDL (+microformats) Parser implementation
|
|
*
|
|
* Copyright (C) 2005-2008, David Beckett http://purl.org/net/dajobe/
|
|
* Copyright (C) 2005, University of Bristol, UK http://www.bristol.ac.uk/
|
|
*
|
|
* This package is Free Software and part of Redland http://librdf.org/
|
|
*
|
|
* It is licensed under the following three licenses as alternatives:
|
|
* 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
|
|
* 2. GNU General Public License (GPL) V2 or any newer version
|
|
* 3. Apache License, V2.0 or any newer version
|
|
*
|
|
* You may not use this file except in compliance with at least one of
|
|
* the above three licenses.
|
|
*
|
|
* See LICENSE.html or LICENSE.txt at the top of this package for the
|
|
* complete terms and further detail along with the license texts for
|
|
* the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* Specifications:
|
|
* Gleaning Resource Descriptions from Dialects of Languages (GRDDL)
|
|
* W3C Proposed Recommendation 16 July 2007
|
|
* http://www.w3.org/TR/2007/PR-grddl-20070716/
|
|
* http://www.w3.org/TR/grddl/
|
|
*
|
|
*/
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <raptor_config.h>
|
|
#endif
|
|
|
|
#ifdef WIN32
|
|
#include <win32_raptor_config.h>
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <stdarg.h>
|
|
#ifdef HAVE_ERRNO_H
|
|
#include <errno.h>
|
|
#endif
|
|
#ifdef HAVE_STDLIB_H
|
|
#include <stdlib.h>
|
|
#endif
|
|
|
|
/* Raptor includes */
|
|
#include "raptor.h"
|
|
#include "raptor_internal.h"
|
|
|
|
#include <libxml/xpath.h>
|
|
/* for xmlXPathRegisterNs() */
|
|
#include <libxml/xpathInternals.h>
|
|
#include <libxml/xinclude.h>
|
|
#include <libxml/HTMLparser.h>
|
|
|
|
#include <libxslt/xslt.h>
|
|
#include <libxslt/transform.h>
|
|
#include <libxslt/xsltutils.h>
|
|
#include <libxslt/security.h>
|
|
|
|
|
|
/*
 * libxslt API notes
 *
 * Inputs to an XSLT transformation process with libxslt are:
 *   1. A set of (key:value) parameters.
 *   2. An xsltStylesheetPtr for the XSLT sheet,
 *      which could be made from a file or from an xmlDoc; that xmlDoc
 *      could in turn be made from a file or a memory buffer.
 *   3. An xmlDoc for the XML source,
 *      which could be made from a file or a memory buffer.
 *
 */
|
|
|
|
|
|
static void raptor_grddl_filter_triples(void *user_data, const raptor_statement *statement);
|
|
|
|
static void raptor_libxslt_error_common(raptor_parser* rdf_parser, const char *msg, va_list args, const char *prefix) RAPTOR_PRINTF_FORMAT(2, 0);
|
|
|
|
static void raptor_grddl_xsltGenericError_handler(void *user_data, const char *msg, ...) RAPTOR_PRINTF_FORMAT(2, 0);
|
|
|
|
|
|
typedef struct
|
|
{
|
|
/* transformation (XSLT) or profile URI */
|
|
raptor_uri* uri;
|
|
/* base URI in effect when the above was found */
|
|
raptor_uri* base_uri;
|
|
} grddl_xml_context;
|
|
|
|
|
|
/*
|
|
* XSLT parser object
|
|
*/
|
|
struct raptor_grddl_parser_context_s {
|
|
raptor_parser* rdf_parser;
|
|
|
|
xmlSAXHandler sax;
|
|
|
|
/* HTML document ctxt */
|
|
htmlParserCtxtPtr html_ctxt;
|
|
/* XML document ctxt */
|
|
xmlParserCtxtPtr xml_ctxt;
|
|
|
|
/* Create xpath evaluation context */
|
|
xmlXPathContextPtr xpathCtx;
|
|
|
|
/* parser for dealing with the result */
|
|
raptor_parser* internal_parser;
|
|
/* ... constructed with this name */
|
|
const char* internal_parser_name;
|
|
|
|
/* sax2 structure - only for recording error pointers */
|
|
raptor_sax2* sax2;
|
|
|
|
/* URI of root namespace of document */
|
|
raptor_uri* root_ns_uri;
|
|
|
|
/* List of transformation URIs for document */
|
|
raptor_sequence* doc_transform_uris;
|
|
|
|
/* Copy of the user data statement_handler overwritten to point to
|
|
* raptor_grddl_filter_triples()
|
|
*/
|
|
void* saved_user_data;
|
|
raptor_statement_handler saved_statement_handler;
|
|
|
|
/* URI data-view:namespaceTransformation */
|
|
raptor_uri* namespace_transformation_uri;
|
|
|
|
/* URI data-view:profileTransformation */
|
|
raptor_uri* profile_transformation_uri;
|
|
|
|
/* List of namespace / <head profile> URIs */
|
|
raptor_sequence* profile_uris;
|
|
|
|
/* List of visited URIs */
|
|
raptor_sequence* visited_uris;
|
|
|
|
/* Depth of GRDDL parsers - 0 means that the lists above
|
|
* are owned by this parser: visited_uris
|
|
* */
|
|
int grddl_depth;
|
|
|
|
/* Content-Type of top-level document */
|
|
char* content_type;
|
|
|
|
/* Check content type once */
|
|
int content_type_check;
|
|
|
|
/* stringbuffer to use to store retrieved document */
|
|
raptor_stringbuffer* sb;
|
|
|
|
/* non-0 to perform an additional RDF/XML parse on a retrieved document
|
|
* because it has been identified as RDF/XML. */
|
|
int process_this_as_rdfxml;
|
|
|
|
/* non-0 to perform GRDL processing on document */
|
|
int grddl_processing;
|
|
|
|
/* non-0 to perform XML Include processing on document */
|
|
int xinclude_processing;
|
|
|
|
/* non-0 to perform HTML Base processing on document */
|
|
int html_base_processing;
|
|
|
|
/* non-0 to perform HTML <link> processing on document */
|
|
int html_link_processing;
|
|
};
|
|
|
|
|
|
typedef struct raptor_grddl_parser_context_s raptor_grddl_parser_context;
|
|
|
|
|
|
static void
|
|
raptor_libxslt_error_common(raptor_parser* rdf_parser,
|
|
const char *msg, va_list args,
|
|
const char *prefix)
|
|
{
|
|
int prefix_length=strlen(prefix);
|
|
int length;
|
|
char *nmsg;
|
|
|
|
length=prefix_length+strlen(msg)+1;
|
|
nmsg=(char*)RAPTOR_MALLOC(cstring, length);
|
|
if(nmsg) {
|
|
strcpy(nmsg, prefix);
|
|
strcpy(nmsg+prefix_length, msg);
|
|
if(nmsg[length-1]=='\n')
|
|
nmsg[length-1]='\0';
|
|
}
|
|
|
|
raptor_parser_error_varargs(rdf_parser,
|
|
nmsg ? nmsg : msg,
|
|
args);
|
|
if(nmsg)
|
|
RAPTOR_FREE(cstring,nmsg);
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_xsltGenericError_handler(void *user_data, const char *msg, ...)
|
|
{
|
|
raptor_parser* rdf_parser=(raptor_parser*)user_data;
|
|
va_list arguments;
|
|
|
|
if(!msg || *msg == '\n')
|
|
return;
|
|
|
|
va_start(arguments, msg);
|
|
raptor_libxslt_error_common(rdf_parser, msg, arguments, "libxslt error: ");
|
|
va_end(arguments);
|
|
}
|
|
|
|
|
|
static grddl_xml_context*
|
|
raptor_new_xml_context(raptor_uri* uri, raptor_uri* base_uri)
|
|
{
|
|
grddl_xml_context* xml_context;
|
|
|
|
xml_context=(grddl_xml_context*)RAPTOR_MALLOC(xml_context, sizeof(grddl_xml_context));
|
|
if(uri)
|
|
uri=raptor_uri_copy(uri);
|
|
if(base_uri)
|
|
base_uri=raptor_uri_copy(base_uri);
|
|
xml_context->uri=uri;
|
|
xml_context->base_uri=base_uri;
|
|
|
|
return xml_context;
|
|
}
|
|
|
|
|
|
static void
|
|
grddl_free_xml_context(void* userdata)
|
|
{
|
|
grddl_xml_context* xml_context=(grddl_xml_context*)userdata;
|
|
|
|
if(xml_context->uri)
|
|
raptor_free_uri(xml_context->uri);
|
|
if(xml_context->base_uri)
|
|
raptor_free_uri(xml_context->base_uri);
|
|
RAPTOR_FREE(grddl_xml_context, xml_context);
|
|
}
|
|
|
|
|
|
static int
|
|
raptor_grddl_parse_init_common(raptor_parser* rdf_parser, const char *name)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
grddl_parser->rdf_parser=rdf_parser;
|
|
|
|
/* sax2 structure - only for recording error pointers */
|
|
grddl_parser->sax2=raptor_new_sax2(rdf_parser, &rdf_parser->error_handlers);
|
|
|
|
/* The following error fields are normally initialised by
|
|
* raptor_libxml_init() via raptor_sax2_parse_start() which is
|
|
* not used here as we go to libxml calls direct.
|
|
*/
|
|
raptor_libxml_init_sax_error_handlers(&grddl_parser->sax);
|
|
|
|
/* Sequence of URIs of XSLT sheets to transform the document */
|
|
grddl_parser->doc_transform_uris=raptor_new_sequence((raptor_sequence_free_handler*)grddl_free_xml_context, NULL);
|
|
|
|
grddl_parser->grddl_processing=1;
|
|
grddl_parser->xinclude_processing=1;
|
|
grddl_parser->html_base_processing=0;
|
|
grddl_parser->html_link_processing=1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
static int
|
|
raptor_grddl_parse_init(raptor_parser* rdf_parser, const char *name)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
raptor_grddl_parse_init_common(rdf_parser, name);
|
|
|
|
/* Sequence of URIs from <head profile> */
|
|
grddl_parser->profile_uris=raptor_new_sequence((raptor_sequence_free_handler*)grddl_free_xml_context, NULL);
|
|
|
|
grddl_parser->namespace_transformation_uri=raptor_new_uri((const unsigned char*)"http://www.w3.org/2003/g/data-view#namespaceTransformation");
|
|
grddl_parser->profile_transformation_uri=raptor_new_uri((const unsigned char*)"http://www.w3.org/2003/g/data-view#profileTransformation");
|
|
|
|
/* Sequence of URIs visited - may be overwritten if this is not
|
|
* the depth 0 grddl parser
|
|
*/
|
|
grddl_parser->visited_uris=raptor_new_sequence((raptor_sequence_free_handler*)raptor_free_uri, (raptor_sequence_print_handler*)raptor_sequence_print_uri);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_parse_terminate(raptor_parser *rdf_parser)
|
|
{
|
|
raptor_grddl_parser_context *grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
if(grddl_parser->xml_ctxt) {
|
|
if(grddl_parser->xml_ctxt->myDoc) {
|
|
xmlFreeDoc(grddl_parser->xml_ctxt->myDoc);
|
|
grddl_parser->xml_ctxt->myDoc=NULL;
|
|
}
|
|
xmlFreeParserCtxt(grddl_parser->xml_ctxt);
|
|
}
|
|
|
|
if(grddl_parser->html_ctxt) {
|
|
if(grddl_parser->html_ctxt->myDoc) {
|
|
xmlFreeDoc(grddl_parser->html_ctxt->myDoc);
|
|
grddl_parser->html_ctxt->myDoc=NULL;
|
|
}
|
|
htmlFreeParserCtxt(grddl_parser->html_ctxt);
|
|
}
|
|
|
|
if(grddl_parser->xpathCtx)
|
|
xmlXPathFreeContext(grddl_parser->xpathCtx);
|
|
|
|
if(grddl_parser->internal_parser)
|
|
raptor_free_parser(grddl_parser->internal_parser);
|
|
|
|
if(grddl_parser->sax2)
|
|
raptor_free_sax2(grddl_parser->sax2);
|
|
|
|
if(grddl_parser->root_ns_uri)
|
|
raptor_free_uri(grddl_parser->root_ns_uri);
|
|
|
|
if(grddl_parser->doc_transform_uris)
|
|
raptor_free_sequence(grddl_parser->doc_transform_uris);
|
|
|
|
if(grddl_parser->profile_uris)
|
|
raptor_free_sequence(grddl_parser->profile_uris);
|
|
|
|
if(grddl_parser->namespace_transformation_uri)
|
|
raptor_free_uri(grddl_parser->namespace_transformation_uri);
|
|
|
|
if(grddl_parser->profile_transformation_uri)
|
|
raptor_free_uri(grddl_parser->profile_transformation_uri);
|
|
|
|
if(!grddl_parser->grddl_depth) {
|
|
if(grddl_parser->visited_uris)
|
|
raptor_free_sequence(grddl_parser->visited_uris);
|
|
}
|
|
|
|
if(grddl_parser->content_type)
|
|
RAPTOR_FREE(cstring, grddl_parser->content_type);
|
|
|
|
if(grddl_parser->sb)
|
|
raptor_free_stringbuffer(grddl_parser->sb);
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_parser_add_parent(raptor_parser *rdf_parser,
|
|
raptor_grddl_parser_context* parent_grddl_parser)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
/* Do not set parent twice */
|
|
if(grddl_parser->visited_uris == parent_grddl_parser->visited_uris)
|
|
return;
|
|
|
|
/* free any sequence here */
|
|
if(grddl_parser->visited_uris)
|
|
raptor_free_sequence(grddl_parser->visited_uris);
|
|
|
|
/* share parent's list and do not free it here */
|
|
grddl_parser->visited_uris= parent_grddl_parser->visited_uris;
|
|
grddl_parser->grddl_depth= parent_grddl_parser->grddl_depth+1;
|
|
|
|
grddl_parser->saved_user_data= parent_grddl_parser->rdf_parser;
|
|
grddl_parser->saved_statement_handler= raptor_grddl_filter_triples;
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
raptor_grddl_parse_start(raptor_parser *rdf_parser)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
raptor_locator *locator=&rdf_parser->locator;
|
|
|
|
locator->line=1;
|
|
|
|
grddl_parser->content_type_check=0;
|
|
grddl_parser->process_this_as_rdfxml=0;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
#define MATCH_IS_VALUE_LIST 1
|
|
#define MATCH_IS_PROFILE 2
|
|
#define MATCH_IS_HARDCODED 4
|
|
/* stop looking for other hardcoded matches */
|
|
#define MATCH_LAST 8
|
|
static struct {
|
|
const xmlChar* xpath;
|
|
int flags;
|
|
const xmlChar* xslt_sheet_uri;
|
|
} match_table[]={
|
|
/* XHTML document where the GRDDL profile is in
|
|
* <link ref='transform' href='url'> inside the html <head>
|
|
* Value of @rel is a space-separated list of link types.
|
|
*/
|
|
{
|
|
(const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/html:link[contains(@rel,\"transformation\")]/@href",
|
|
0,
|
|
NULL
|
|
}
|
|
,
|
|
/* XHTML document where the GRDDL profile is in
|
|
* <a rel='transform' href='url'> inside the html <body>
|
|
* Value of @rel is a space-separated list of link types.
|
|
*/
|
|
{
|
|
(const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/../..//html:a[contains(@rel,\"transformation\")]/@href",
|
|
0,
|
|
NULL
|
|
}
|
|
,
|
|
/* XML document linking to transform via attribute dataview:transformation
|
|
* on the root element.
|
|
* Example: http://www.w3.org/2004/01/rdxh/grddl-p3p-example
|
|
**/
|
|
{
|
|
(const xmlChar*)"/*/@dataview:transformation",
|
|
MATCH_IS_VALUE_LIST,
|
|
NULL
|
|
}
|
|
,
|
|
/* hCalendar microformat http://microformats.org/wiki/hcalendar */
|
|
{
|
|
(const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' vevent ')]",
|
|
MATCH_IS_HARDCODED,
|
|
(const xmlChar*)"http://www.w3.org/2002/12/cal/glean-hcal.xsl"
|
|
}
|
|
,
|
|
/* hReview microformat http://microformats.org/wiki/review */
|
|
{
|
|
(const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' hreview ')]",
|
|
MATCH_IS_HARDCODED | MATCH_LAST, /* stop here since hCard is inside hReview */
|
|
(const xmlChar*)"http://www.w3.org/2001/sw/grddl-wg/doc29/hreview2rdfxml.xsl"
|
|
}
|
|
,
|
|
/* hCard microformat http://microformats.org/wiki/hcard */
|
|
{
|
|
(const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' vcard ')]",
|
|
MATCH_IS_HARDCODED,
|
|
(const xmlChar*)"http://www.w3.org/2006/vcard/hcard2rdf.xsl"
|
|
}
|
|
,
|
|
{
|
|
NULL,
|
|
0,
|
|
0
|
|
}
|
|
};
|
|
|
|
|
|
/* NULL-terminated list of namespace URIs to ignore.
 * NOTE(review): the code consulting this list is outside this part of
 * the file - presumably these namespaces are skipped when looking for
 * namespace transformations; confirm against the user.
 */
static const char* grddl_namespace_uris_ignore_list[]={
  "http://www.w3.org/1999/xhtml",                /* XHTML */
  "http://www.w3.org/1999/02/22-rdf-syntax-ns#", /* RDF */
  "http://www.w3.org/2001/XMLSchema",            /* XML Schema */
  NULL
};
|
|
|
|
|
|
/* add URI to XSLT transformation URI list */
|
|
static void
|
|
raptor_grddl_add_transform_xml_context(raptor_grddl_parser_context* grddl_parser,
|
|
grddl_xml_context* xml_context)
|
|
{
|
|
int i;
|
|
raptor_uri* uri=xml_context->uri;
|
|
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG2("Found document transformation URI '%s'\n",
|
|
raptor_uri_as_string(uri));
|
|
#endif
|
|
|
|
for(i=0; i < raptor_sequence_size(grddl_parser->doc_transform_uris); i++) {
|
|
grddl_xml_context* xc=(grddl_xml_context*)raptor_sequence_get_at(grddl_parser->doc_transform_uris, i);
|
|
if(raptor_uri_equals(uri, xc->uri)) {
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG2("Already seen XSLT URI '%s'\n", raptor_uri_as_string(uri));
|
|
#endif
|
|
grddl_free_xml_context(xml_context);
|
|
return;
|
|
}
|
|
}
|
|
|
|
RAPTOR_DEBUG3("Adding new document transformation XSLT URI %s with base URI %s\n",
|
|
(uri ? (const char*)raptor_uri_as_string(uri): "(NONE)"),
|
|
(xml_context->base_uri ? (const char*)raptor_uri_as_string(xml_context->base_uri) : "(NONE)"));
|
|
|
|
raptor_sequence_push(grddl_parser->doc_transform_uris, xml_context);
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_filter_triples(void *user_data, const raptor_statement *statement)
|
|
{
|
|
raptor_parser* rdf_parser=(raptor_parser*)user_data;
|
|
raptor_grddl_parser_context* grddl_parser;
|
|
int i;
|
|
raptor_uri* predicate_uri;
|
|
|
|
grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
/* Look for a triple <uri> <uri> <uri> */
|
|
if(!statement->subject_type == RAPTOR_IDENTIFIER_TYPE_RESOURCE ||
|
|
!statement->predicate_type == RAPTOR_IDENTIFIER_TYPE_RESOURCE ||
|
|
!statement->object_type == RAPTOR_IDENTIFIER_TYPE_RESOURCE)
|
|
return;
|
|
|
|
#if RAPTOR_DEBUG > 2
|
|
RAPTOR_DEBUG2("Parser %p: Relaying statement: ", rdf_parser);
|
|
raptor_print_statement(statement, stderr);
|
|
fputc('\n', stderr);
|
|
#endif
|
|
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG3("Parser %p: Checking against %d profile URIs\n",
|
|
rdf_parser, raptor_sequence_size(grddl_parser->profile_uris));
|
|
#endif
|
|
|
|
/* Look for (i=0, root namespace URI)
|
|
* <document-root-element-namespace-URI> data-view:namespaceTransformation ?tr
|
|
* or (i>0, profile URIs)
|
|
* <document-root-element-namespace-URI> data-view:profileTransformation ?tr
|
|
* and then ?tr becomes a new document transformation URI
|
|
*/
|
|
predicate_uri=grddl_parser->namespace_transformation_uri;
|
|
for(i=0; i < raptor_sequence_size(grddl_parser->profile_uris); i++) {
|
|
grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_get_at(grddl_parser->profile_uris, i);
|
|
raptor_uri* profile_uri=xml_context->uri;
|
|
grddl_xml_context* new_xml_context;
|
|
|
|
if(i==1)
|
|
predicate_uri=grddl_parser->profile_transformation_uri;
|
|
|
|
if(!profile_uri)
|
|
continue;
|
|
|
|
if(raptor_uri_equals((raptor_uri*)statement->subject, profile_uri) &&
|
|
raptor_uri_equals((raptor_uri*)statement->predicate, predicate_uri)) {
|
|
raptor_uri* uri=(raptor_uri*)statement->object;
|
|
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG4("Parser %p: Matches profile URI #%d '%s'\n",
|
|
rdf_parser, i, raptor_uri_as_string(profile_uri));
|
|
#endif
|
|
|
|
new_xml_context=raptor_new_xml_context(uri, rdf_parser->base_uri);
|
|
raptor_grddl_add_transform_xml_context(grddl_parser, new_xml_context);
|
|
} else {
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG4("Parser %p: Failed to match profile URI #%d '%s'\n",
|
|
rdf_parser, i, raptor_uri_as_string(profile_uri));
|
|
#endif
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
static int
|
|
raptor_grddl_ensure_internal_parser(raptor_parser* rdf_parser,
|
|
const char* parser_name, int filter)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
if(!grddl_parser->internal_parser_name ||
|
|
!strcmp(parser_name, "guess") ||
|
|
strcmp(grddl_parser->internal_parser_name, parser_name)) {
|
|
/* construct a new parser if none in use or not what is required */
|
|
if(grddl_parser->internal_parser) {
|
|
RAPTOR_DEBUG3("Parser %p: Freeing internal %s parser.\n",
|
|
rdf_parser, grddl_parser->internal_parser_name);
|
|
|
|
raptor_free_parser(grddl_parser->internal_parser);
|
|
grddl_parser->internal_parser=NULL;
|
|
grddl_parser->internal_parser_name=NULL;
|
|
}
|
|
|
|
RAPTOR_DEBUG3("Parser %p: Allocating new internal %s parser.\n",
|
|
rdf_parser, parser_name);
|
|
grddl_parser->internal_parser=raptor_new_parser(parser_name);
|
|
if(!grddl_parser->internal_parser) {
|
|
raptor_parser_error(rdf_parser, "Failed to create %s parser",
|
|
parser_name);
|
|
return 1;
|
|
} else {
|
|
/* initialise the new parser with the outer state */
|
|
grddl_parser->internal_parser_name=parser_name;
|
|
raptor_parser_copy_user_state(grddl_parser->internal_parser,
|
|
rdf_parser);
|
|
grddl_parser->saved_user_data=rdf_parser->user_data;
|
|
grddl_parser->saved_statement_handler=rdf_parser->statement_handler;
|
|
}
|
|
}
|
|
|
|
/* Filter the triples for profile/namespace URIs */
|
|
if(filter) {
|
|
grddl_parser->internal_parser->user_data= rdf_parser;
|
|
grddl_parser->internal_parser->statement_handler= raptor_grddl_filter_triples;
|
|
} else {
|
|
grddl_parser->internal_parser->user_data= grddl_parser->saved_user_data;
|
|
grddl_parser->internal_parser->statement_handler= grddl_parser->saved_statement_handler;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/* Run a GRDDL transform using a pre-parsed XSLT stylesheet already
|
|
* formed into a libxml document (with URI)
|
|
*/
|
|
static int
|
|
raptor_grddl_run_grddl_transform_doc(raptor_parser* rdf_parser,
|
|
grddl_xml_context* xml_context,
|
|
xmlDocPtr xslt_doc,
|
|
xmlDocPtr doc)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
int ret=0;
|
|
xsltStylesheetPtr sheet=NULL;
|
|
xmlDocPtr res=NULL;
|
|
xmlChar *doc_txt=NULL;
|
|
int doc_txt_len=0;
|
|
const char* parser_name;
|
|
const char* params[7];
|
|
const unsigned char* base_uri_string;
|
|
size_t base_uri_len;
|
|
raptor_uri* xslt_uri;
|
|
raptor_uri* base_uri;
|
|
char *quoted_base_uri=NULL;
|
|
|
|
xslt_uri=xml_context->uri;
|
|
base_uri=xml_context->base_uri ? xml_context->base_uri : xml_context->uri;
|
|
|
|
base_uri_string=raptor_uri_as_counted_string(base_uri, &base_uri_len);
|
|
|
|
RAPTOR_DEBUG3("Running GRDDL transform with XSLT URI '%s' with doc base URI '%s'\n",
|
|
raptor_uri_as_string(xslt_uri),
|
|
base_uri_string);
|
|
|
|
sheet = xsltParseStylesheetDoc(xslt_doc);
|
|
if(!sheet) {
|
|
raptor_parser_error(rdf_parser, "Failed to parse stylesheet in '%s'",
|
|
raptor_uri_as_string(xslt_uri));
|
|
ret=1;
|
|
goto cleanup_xslt;
|
|
}
|
|
|
|
xsltSetGenericErrorFunc(rdf_parser, raptor_grddl_xsltGenericError_handler);
|
|
|
|
#if 1
|
|
/* FIXME:
|
|
* Define 'base', 'Base' and 'url' params to allow some XSLT sheets to work:
|
|
* base:
|
|
* http://www.w3.org/2000/07/uri43/uri.xsl
|
|
* Base:
|
|
* http://www.w3.org/2000/08/w3c-synd/home2rss.xsl
|
|
* url: (optional)
|
|
* http://www.w3.org/2001/sw/grddl-wg/td/RDFa2RDFXML.xsl
|
|
*/
|
|
quoted_base_uri=(char*)RAPTOR_MALLOC(cstring, base_uri_len+3);
|
|
quoted_base_uri[0]='\'';
|
|
strncpy(quoted_base_uri+1, (const char*)base_uri_string, base_uri_len);
|
|
quoted_base_uri[base_uri_len+1]='\'';
|
|
quoted_base_uri[base_uri_len+2]='\0';
|
|
|
|
params[0]="base";
|
|
params[1]=(const char*)quoted_base_uri;
|
|
params[2]="Base";
|
|
params[3]=(const char*)quoted_base_uri;
|
|
params[4]="url";
|
|
params[5]=(const char*)quoted_base_uri;
|
|
params[6]=NULL;
|
|
|
|
res = xsltApplyStylesheet(sheet, doc, params);
|
|
#else
|
|
res = xsltApplyStylesheet(sheet, doc, NULL); /* No params */
|
|
#endif
|
|
if(!res) {
|
|
raptor_parser_error(rdf_parser, "Failed to apply stylesheet in '%s'",
|
|
raptor_uri_as_string(xslt_uri));
|
|
ret=1;
|
|
goto cleanup_xslt;
|
|
}
|
|
|
|
if(res->type == XML_HTML_DOCUMENT_NODE) {
|
|
if(sheet->method != NULL)
|
|
xmlFree(sheet->method);
|
|
sheet->method = (xmlChar*)xmlMalloc(5);
|
|
strncpy((char*)sheet->method, "html", 5);
|
|
}
|
|
|
|
/* write the resulting XML to a string */
|
|
xsltSaveResultToString(&doc_txt, &doc_txt_len, res, sheet);
|
|
|
|
if(!doc_txt || !doc_txt_len) {
|
|
/* FIXME: continue with an empty document? */
|
|
raptor_parser_warning(rdf_parser, "XSLT returned an empty document");
|
|
goto cleanup_xslt;
|
|
}
|
|
|
|
RAPTOR_DEBUG4("XSLT returned %d bytes document method %s media type %s\n",
|
|
doc_txt_len,
|
|
(sheet->method ? (const char*)sheet->method : "NULL"),
|
|
(sheet->mediaType ? (const char*)sheet->mediaType : "NULL"));
|
|
|
|
/* FIXME: Assumes mime types for XSLT <xsl:output method> */
|
|
if(sheet->mediaType == NULL && sheet->method) {
|
|
if(!(strcmp((const char*)sheet->method, "text"))) {
|
|
sheet->mediaType = (xmlChar*)xmlMalloc(11);
|
|
strncpy((char*)sheet->mediaType, "text/plain",11);
|
|
} else if(!(strcmp((const char*)sheet->method, "xml"))) {
|
|
sheet->mediaType = (xmlChar*)xmlMalloc(16);
|
|
strncpy((char*)sheet->mediaType, "application/xml",16);
|
|
} else if(!(strcmp((const char*)sheet->method, "html"))) {
|
|
sheet->mediaType = (xmlChar*)xmlMalloc(10);
|
|
/* FIXME: use xhtml mime type? */
|
|
strncpy((char*)sheet->mediaType, "text/html",10);
|
|
}
|
|
}
|
|
|
|
/* FIXME: Assume all that all media XML is RDF/XML and also that
|
|
* with no information at all we have RDF/XML
|
|
*/
|
|
if(!sheet->mediaType ||
|
|
(sheet->mediaType &&
|
|
!strcmp((const char*)sheet->mediaType, "application/xml"))) {
|
|
if(sheet->mediaType)
|
|
xmlFree(sheet->mediaType);
|
|
sheet->mediaType = (xmlChar*)xmlMalloc(20);
|
|
strncpy((char*)sheet->mediaType, "application/rdf+xml",20);
|
|
}
|
|
|
|
parser_name=raptor_guess_parser_name(NULL, (const char*)sheet->mediaType,
|
|
doc_txt, doc_txt_len, NULL);
|
|
if(!parser_name) {
|
|
RAPTOR_DEBUG3("Parser %p: Guessed no parser from mime type '%s' and content - ending",
|
|
rdf_parser, sheet->mediaType);
|
|
goto cleanup_xslt;
|
|
}
|
|
|
|
RAPTOR_DEBUG4("Parser %p: Guessed parser %s from mime type '%s' and content\n",
|
|
rdf_parser, parser_name, sheet->mediaType);
|
|
|
|
if(!strcmp((const char*)parser_name, "grddl")) {
|
|
RAPTOR_DEBUG2("Parser %p: Ignoring guess to run grddl parser - ending",
|
|
rdf_parser);
|
|
goto cleanup_xslt;
|
|
}
|
|
|
|
ret=raptor_grddl_ensure_internal_parser(rdf_parser, parser_name, 0);
|
|
if(ret)
|
|
goto cleanup_xslt;
|
|
|
|
if(grddl_parser->internal_parser) {
|
|
grddl_parser->internal_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(rdf_parser);
|
|
|
|
/* generate the triples */
|
|
raptor_start_parse(grddl_parser->internal_parser, base_uri);
|
|
raptor_parse_chunk(grddl_parser->internal_parser, doc_txt, doc_txt_len, 1);
|
|
|
|
rdf_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(grddl_parser->internal_parser);
|
|
}
|
|
|
|
cleanup_xslt:
|
|
|
|
if(quoted_base_uri)
|
|
RAPTOR_FREE(cstring, quoted_base_uri);
|
|
|
|
if(doc_txt)
|
|
xmlFree(doc_txt);
|
|
|
|
if(res)
|
|
xmlFreeDoc(res);
|
|
|
|
if(sheet)
|
|
xsltFreeStylesheet(sheet);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
typedef struct
|
|
{
|
|
raptor_parser* rdf_parser;
|
|
xmlParserCtxtPtr xc;
|
|
raptor_uri* base_uri;
|
|
} raptor_grddl_xml_parse_bytes_context;
|
|
|
|
|
|
static void
|
|
raptor_grddl_uri_xml_parse_bytes(raptor_www* www,
|
|
void *userdata,
|
|
const void *ptr, size_t size, size_t nmemb)
|
|
{
|
|
raptor_grddl_xml_parse_bytes_context* xpbc=(raptor_grddl_xml_parse_bytes_context*)userdata;
|
|
int len=size*nmemb;
|
|
int rc=0;
|
|
|
|
if(!xpbc->xc) {
|
|
xmlParserCtxtPtr xc;
|
|
|
|
xc = xmlCreatePushParserCtxt(NULL, NULL,
|
|
(const char*)ptr, len,
|
|
(const char*)raptor_uri_as_string(xpbc->base_uri));
|
|
if(!xc)
|
|
rc=1;
|
|
else {
|
|
int libxml_options = 0;
|
|
|
|
#ifdef RAPTOR_LIBXML_XML_PARSE_NONET
|
|
if(xpbc->rdf_parser->features[RAPTOR_FEATURE_NO_NET])
|
|
libxml_options |= XML_PARSE_NONET;
|
|
#endif
|
|
#ifdef HAVE_XMLCTXTUSEOPTIONS
|
|
xmlCtxtUseOptions(xc, libxml_options);
|
|
#endif
|
|
|
|
xc->replaceEntities = 1;
|
|
xc->loadsubset = 1;
|
|
}
|
|
xpbc->xc=xc;
|
|
} else
|
|
rc=xmlParseChunk(xpbc->xc, (const char*)ptr, len, 0);
|
|
|
|
if(rc)
|
|
raptor_parser_error(xpbc->rdf_parser, "XML Parsing failed");
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_discard_message(void *user_data, raptor_locator* locator,
|
|
const char *message)
|
|
{
|
|
RAPTOR_DEBUG3("%s: Discarded error message: %s\n",
|
|
raptor_uri_as_string(locator->uri), message);
|
|
return;
|
|
}
|
|
|
|
|
|
#define FETCH_IGNORE_ERRORS 1
|
|
#define FETCH_ACCEPT_XSLT 2
|
|
|
|
static int
|
|
raptor_grddl_fetch_uri(raptor_parser* rdf_parser,
|
|
raptor_uri* uri,
|
|
raptor_www_write_bytes_handler write_bytes_handler,
|
|
void* write_bytes_user_data,
|
|
raptor_www_content_type_handler content_type_handler,
|
|
void* content_type_user_data,
|
|
int flags)
|
|
{
|
|
raptor_www *www;
|
|
const char *accept_h;
|
|
int ret=0;
|
|
int ignore_errors=(flags & FETCH_IGNORE_ERRORS);
|
|
|
|
if(rdf_parser->features[RAPTOR_FEATURE_NO_NET]) {
|
|
if(!raptor_uri_uri_string_is_file_uri(raptor_uri_as_string(uri)))
|
|
return 1;
|
|
}
|
|
|
|
www=raptor_www_new();
|
|
if(!www)
|
|
return 1;
|
|
|
|
raptor_www_set_user_agent(www, "grddl/0.1");
|
|
|
|
if(flags & FETCH_ACCEPT_XSLT) {
|
|
raptor_www_set_http_accept(www, "application/xml");
|
|
} else {
|
|
accept_h=raptor_parser_get_accept_header(rdf_parser);
|
|
if(accept_h) {
|
|
raptor_www_set_http_accept(www, accept_h);
|
|
RAPTOR_FREE(cstring, accept_h);
|
|
}
|
|
}
|
|
if(rdf_parser->uri_filter)
|
|
raptor_www_set_uri_filter(www, rdf_parser->uri_filter,
|
|
rdf_parser->uri_filter_user_data);
|
|
if(ignore_errors)
|
|
raptor_www_set_error_handler(www, raptor_grddl_discard_message, NULL);
|
|
else
|
|
raptor_www_set_error_handler(www,
|
|
rdf_parser->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].handler,
|
|
rdf_parser->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].user_data);
|
|
raptor_www_set_write_bytes_handler(www, write_bytes_handler,
|
|
write_bytes_user_data);
|
|
raptor_www_set_content_type_handler(www, content_type_handler,
|
|
content_type_user_data);
|
|
|
|
if(rdf_parser->features[RAPTOR_FEATURE_WWW_TIMEOUT] > 0)
|
|
raptor_www_set_connection_timeout(www,
|
|
rdf_parser->features[RAPTOR_FEATURE_WWW_TIMEOUT]);
|
|
|
|
ret=raptor_www_fetch(www, uri);
|
|
|
|
raptor_www_free(www);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
/* Run a GRDDL transform using a XSLT stylesheet at a given URI */
|
|
static int
|
|
raptor_grddl_run_grddl_transform_uri(raptor_parser* rdf_parser,
|
|
grddl_xml_context* xml_context,
|
|
xmlDocPtr doc)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser;
|
|
xmlParserCtxtPtr xslt_ctxt=NULL;
|
|
raptor_grddl_xml_parse_bytes_context xpbc;
|
|
int ret=0;
|
|
raptor_uri* xslt_uri;
|
|
raptor_uri* base_uri;
|
|
raptor_uri* old_locator_uri;
|
|
raptor_locator *locator=&rdf_parser->locator;
|
|
|
|
xslt_uri=xml_context->uri;
|
|
base_uri=xml_context->base_uri ? xml_context->base_uri : xml_context->uri;
|
|
|
|
grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
RAPTOR_DEBUG3("Running GRDDL transform with XSLT URI %s and base URI %s\n",
|
|
raptor_uri_as_string(xslt_uri),
|
|
raptor_uri_as_string(base_uri));
|
|
|
|
/* make an xsltStylesheetPtr via the raptor_grddl_uri_xml_parse_bytes
|
|
* callback as bytes are returned
|
|
*/
|
|
xpbc.xc=NULL;
|
|
xpbc.rdf_parser=rdf_parser;
|
|
xpbc.base_uri=xslt_uri;
|
|
|
|
old_locator_uri=locator->uri;
|
|
locator->uri=xslt_uri;
|
|
ret=raptor_grddl_fetch_uri(rdf_parser,
|
|
xslt_uri,
|
|
raptor_grddl_uri_xml_parse_bytes, &xpbc,
|
|
NULL, NULL,
|
|
FETCH_ACCEPT_XSLT);
|
|
if(ret) {
|
|
locator->uri=old_locator_uri;
|
|
raptor_parser_warning(rdf_parser, "Fetching XSLT document URI '%s' failed",
|
|
raptor_uri_as_string(xslt_uri));
|
|
ret=0;
|
|
} else {
|
|
xslt_ctxt=xpbc.xc;
|
|
xmlParseChunk(xpbc.xc, NULL, 0, 1);
|
|
|
|
ret=raptor_grddl_run_grddl_transform_doc(rdf_parser,
|
|
xml_context,
|
|
xslt_ctxt->myDoc,
|
|
doc);
|
|
locator->uri=old_locator_uri;
|
|
}
|
|
|
|
if(xslt_ctxt)
|
|
xmlFreeParserCtxt(xslt_ctxt);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
static int
|
|
raptor_grddl_seen_uri(raptor_grddl_parser_context* grddl_parser,
|
|
raptor_uri* uri)
|
|
{
|
|
int i;
|
|
int seen=0;
|
|
raptor_sequence* seq=grddl_parser->visited_uris;
|
|
|
|
for(i=0; i < raptor_sequence_size(seq); i++) {
|
|
raptor_uri* vuri=(raptor_uri*)raptor_sequence_get_at(seq, i);
|
|
if(raptor_uri_equals(uri, vuri)) {
|
|
seen=1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(seen)
|
|
RAPTOR_DEBUG2("Already seen URI '%s'\n", raptor_uri_as_string(uri));
|
|
|
|
return seen;
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_done_uri(raptor_grddl_parser_context* grddl_parser,
|
|
raptor_uri* uri)
|
|
{
|
|
if(!grddl_parser->visited_uris)
|
|
return;
|
|
|
|
if(!raptor_grddl_seen_uri(grddl_parser, uri)) {
|
|
raptor_sequence* seq=grddl_parser->visited_uris;
|
|
raptor_sequence_push(seq, raptor_uri_copy(uri));
|
|
}
|
|
}
|
|
|
|
|
|
static raptor_sequence*
|
|
raptor_grddl_run_xpath_match(raptor_parser* rdf_parser,
|
|
xmlDocPtr doc,
|
|
const xmlChar* xpathExpr,
|
|
int flags)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser;
|
|
/* Evaluate xpath expression */
|
|
xmlXPathObjectPtr xpathObj=NULL;
|
|
raptor_sequence* seq=NULL;
|
|
xmlNodeSetPtr nodes;
|
|
int i;
|
|
|
|
grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
seq=raptor_new_sequence((raptor_sequence_free_handler*)grddl_free_xml_context, NULL);
|
|
|
|
/* Evaluate xpath expression */
|
|
xpathObj = xmlXPathEvalExpression(xpathExpr,
|
|
grddl_parser->xpathCtx);
|
|
if(!xpathObj) {
|
|
raptor_parser_error(rdf_parser,
|
|
"Unable to evaluate XPath expression \"%s\"",
|
|
xpathExpr);
|
|
raptor_free_sequence(seq); seq=NULL;
|
|
goto cleanup_xpath_match;
|
|
}
|
|
|
|
nodes=xpathObj->nodesetval;
|
|
if(!nodes || xmlXPathNodeSetIsEmpty(nodes)) {
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG3("No match found with XPath expression \"%s\" over '%s'\n",
|
|
xpathExpr, raptor_uri_as_string(rdf_parser->base_uri));
|
|
#endif
|
|
raptor_free_sequence(seq); seq=NULL;
|
|
goto cleanup_xpath_match;
|
|
}
|
|
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG3("Found match with XPath expression \"%s\" over '%s'\n",
|
|
xpathExpr, raptor_uri_as_string(rdf_parser->base_uri));
|
|
#endif
|
|
|
|
for(i=0; i < xmlXPathNodeSetGetLength(nodes); i++) {
|
|
xmlNodePtr node=nodes->nodeTab[i];
|
|
const unsigned char* uri_string=NULL;
|
|
xmlChar *base_uri_string;
|
|
raptor_uri* base_uri=NULL;
|
|
raptor_uri* uri=NULL;
|
|
|
|
if(node->type != XML_ATTRIBUTE_NODE &&
|
|
node->type != XML_ELEMENT_NODE) {
|
|
raptor_parser_error(rdf_parser, "Got unexpected node type %d",
|
|
node->type);
|
|
continue;
|
|
}
|
|
|
|
|
|
/* xmlNodeGetBase() returns base URI or NULL and must be freed
|
|
* with xmlFree()
|
|
*/
|
|
if(grddl_parser->html_base_processing) {
|
|
xmlElementType savedType=doc->type;
|
|
doc->type=XML_HTML_DOCUMENT_NODE;
|
|
base_uri_string=xmlNodeGetBase(doc, node);
|
|
doc->type=savedType;
|
|
} else
|
|
base_uri_string=xmlNodeGetBase(doc, node);
|
|
|
|
|
|
if(node->type == XML_ATTRIBUTE_NODE)
|
|
uri_string=(const unsigned char*)node->children->content;
|
|
else { /* XML_ELEMENT_NODE */
|
|
if(node->ns)
|
|
uri_string=(const unsigned char*)node->ns->href;
|
|
}
|
|
|
|
|
|
if(base_uri_string) {
|
|
base_uri=raptor_new_uri(base_uri_string);
|
|
xmlFree(base_uri_string);
|
|
#if RAPTOR_DEBUG > 1
|
|
RAPTOR_DEBUG2("XML base URI of match is '%s'\n", raptor_uri_as_string(base_uri));
|
|
#endif
|
|
} else if(rdf_parser->base_uri)
|
|
base_uri=raptor_uri_copy(rdf_parser->base_uri);
|
|
else
|
|
base_uri=NULL;
|
|
|
|
if(flags & MATCH_IS_VALUE_LIST) {
|
|
char *start;
|
|
char *end;
|
|
char* buffer;
|
|
size_t list_len=strlen((const char*)uri_string);
|
|
|
|
buffer=(char*)RAPTOR_MALLOC(cstring, list_len+1);
|
|
strncpy(buffer, (const char*)uri_string, list_len+1);
|
|
|
|
for(start=end=buffer; end; start=end+1) {
|
|
grddl_xml_context* xml_context;
|
|
|
|
end=strchr(start, ' ');
|
|
if(end)
|
|
*end='\0';
|
|
|
|
if(start == end)
|
|
continue;
|
|
|
|
#if RAPTOR_DEBUG
|
|
RAPTOR_DEBUG2("Got list match URI '%s'\n", start);
|
|
#endif
|
|
|
|
uri=raptor_new_uri_relative_to_base(base_uri,
|
|
(const unsigned char*)start);
|
|
if(flags & MATCH_IS_PROFILE &&
|
|
!strcmp((const char*)raptor_uri_as_string(uri),
|
|
"http://www.w3.org/2003/g/data-view'")) {
|
|
raptor_free_uri(uri);
|
|
continue;
|
|
}
|
|
|
|
xml_context=raptor_new_xml_context(uri, base_uri);
|
|
raptor_sequence_push(seq, xml_context);
|
|
}
|
|
RAPTOR_FREE(cstring, buffer);
|
|
} else if (flags & MATCH_IS_HARDCODED) {
|
|
#if RAPTOR_DEBUG
|
|
RAPTOR_DEBUG2("Got hardcoded XSLT match for %s\n", xpathExpr);
|
|
#endif
|
|
/* return at first match, that's enough */
|
|
break;
|
|
} else {
|
|
grddl_xml_context* xml_context;
|
|
#if RAPTOR_DEBUG
|
|
RAPTOR_DEBUG2("Got single match URI '%s'\n", uri_string);
|
|
#endif
|
|
|
|
uri=raptor_new_uri_relative_to_base(base_uri, uri_string);
|
|
xml_context=raptor_new_xml_context(uri, base_uri);
|
|
raptor_sequence_push(seq, xml_context);
|
|
raptor_free_uri(uri);
|
|
}
|
|
|
|
if(base_uri)
|
|
raptor_free_uri(base_uri);
|
|
}
|
|
|
|
cleanup_xpath_match:
|
|
if(xpathObj)
|
|
xmlXPathFreeObject(xpathObj);
|
|
|
|
return seq;
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_check_recursive_content_type_handler(raptor_www* www,
|
|
void* userdata,
|
|
const char* content_type)
|
|
{
|
|
raptor_parser* rdf_parser=(raptor_parser*)userdata;
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
size_t len;
|
|
|
|
if(!content_type)
|
|
return;
|
|
|
|
len=strlen(content_type)+1;
|
|
if(grddl_parser->content_type)
|
|
RAPTOR_FREE(cstring,grddl_parser->content_type);
|
|
grddl_parser->content_type=(char*)RAPTOR_MALLOC(cstring, len+1);
|
|
strncpy(grddl_parser->content_type, content_type, len+1);
|
|
|
|
if(!strncmp(content_type, "application/rdf+xml", 19)) {
|
|
grddl_parser->process_this_as_rdfxml=1;
|
|
|
|
RAPTOR_DEBUG2("Parser %p: Found RDF/XML content type\n", rdf_parser);
|
|
raptor_parser_save_content(rdf_parser, 1);
|
|
}
|
|
|
|
if(!strncmp(content_type, "text/html", 9) ||
|
|
!strncmp(content_type, "application/html+xml", 20)) {
|
|
RAPTOR_DEBUG3("Parser %p: Found HTML content type '%s'\n",
|
|
rdf_parser, content_type);
|
|
grddl_parser->html_base_processing=1;
|
|
}
|
|
|
|
}
|
|
|
|
#define RECURSIVE_FLAGS_IGNORE_ERRORS 1
|
|
#define RECURSIVE_FLAGS_FILTER 2
|
|
|
|
static int
|
|
raptor_grddl_run_recursive(raptor_parser* rdf_parser, raptor_uri* uri,
|
|
const char *parser_name, int flags)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser;
|
|
raptor_www_content_type_handler content_type_handler=NULL;
|
|
int ret=0;
|
|
const unsigned char* ibuffer=NULL;
|
|
size_t ibuffer_len=0;
|
|
raptor_parse_bytes_context rpbc;
|
|
int ignore_errors=(flags & RECURSIVE_FLAGS_IGNORE_ERRORS) > 0;
|
|
int filter=(flags & RECURSIVE_FLAGS_FILTER) > 0;
|
|
int fetch_uri_flags=0;
|
|
int is_grddl=!strcmp(parser_name, "grddl");
|
|
|
|
grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
if(raptor_grddl_seen_uri(grddl_parser, uri))
|
|
return 0;
|
|
|
|
if(is_grddl)
|
|
content_type_handler=raptor_grddl_check_recursive_content_type_handler;
|
|
|
|
if(raptor_grddl_ensure_internal_parser(rdf_parser, parser_name, filter))
|
|
return !ignore_errors;
|
|
|
|
RAPTOR_DEBUG3("Running recursive %s operation on URI '%s'\n",
|
|
parser_name, raptor_uri_as_string(uri));
|
|
|
|
grddl_parser->internal_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(rdf_parser);
|
|
|
|
if(is_grddl)
|
|
raptor_grddl_parser_add_parent(grddl_parser->internal_parser, grddl_parser);
|
|
|
|
rpbc.rdf_parser=grddl_parser->internal_parser;
|
|
rpbc.base_uri=NULL;
|
|
rpbc.final_uri=NULL;
|
|
rpbc.started=0;
|
|
|
|
if(ignore_errors)
|
|
fetch_uri_flags |=FETCH_IGNORE_ERRORS;
|
|
|
|
if(raptor_grddl_fetch_uri(grddl_parser->internal_parser,
|
|
uri,
|
|
raptor_parse_uri_write_bytes, &rpbc,
|
|
content_type_handler, grddl_parser->internal_parser,
|
|
fetch_uri_flags)) {
|
|
if(!ignore_errors)
|
|
raptor_parser_warning(rdf_parser,
|
|
"Fetching GRDDL document URI '%s' failed\n",
|
|
raptor_uri_as_string(uri));
|
|
ret=0;
|
|
goto tidy;
|
|
}
|
|
|
|
if(ignore_errors) {
|
|
raptor_error_handlers* eh=&grddl_parser->internal_parser->error_handlers;
|
|
int i;
|
|
/* NOTE not setting RAPTOR_LOG_LEVEL_NONE handler */
|
|
for(i=1; i <= (int)eh->last_log_level; i++) {
|
|
eh->handlers[i].handler=raptor_grddl_discard_message;
|
|
eh->handlers[i].user_data=NULL;
|
|
}
|
|
}
|
|
|
|
raptor_parse_chunk(grddl_parser->internal_parser, NULL, 0, 1);
|
|
rdf_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(grddl_parser->internal_parser);
|
|
|
|
/* If content was saved, process it as RDF/XML */
|
|
ibuffer=raptor_parser_get_content(grddl_parser->internal_parser,
|
|
&ibuffer_len);
|
|
if(ibuffer && strcmp(parser_name, "rdfxml")) {
|
|
RAPTOR_DEBUG2("Running additional RDF/XML parse on URI '%s' content\n",
|
|
raptor_uri_as_string(uri));
|
|
|
|
if(raptor_grddl_ensure_internal_parser(rdf_parser, "rdfxml", 1))
|
|
ret=1;
|
|
else {
|
|
grddl_parser->internal_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(rdf_parser);
|
|
|
|
if(raptor_start_parse(grddl_parser->internal_parser, uri))
|
|
ret=1;
|
|
else {
|
|
ret=raptor_parse_chunk(grddl_parser->internal_parser, ibuffer,
|
|
ibuffer_len, 1);
|
|
rdf_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(grddl_parser->internal_parser);
|
|
}
|
|
}
|
|
|
|
RAPTOR_FREE(cstring, ibuffer);
|
|
raptor_parser_save_content(grddl_parser->internal_parser, 0);
|
|
}
|
|
|
|
if(rpbc.final_uri)
|
|
raptor_free_uri(rpbc.final_uri);
|
|
|
|
if(ignore_errors)
|
|
ret=0;
|
|
|
|
tidy:
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
/*
 * raptor_grddl_libxml_discard_error - printf-style message sink
 * @user_data: ignored
 * @msg: ignored format string (as are any following arguments)
 *
 * Installed as the libxml validity warning/error callback so that such
 * messages are silently dropped.
 */
static void
raptor_grddl_libxml_discard_error(void* user_data, const char *msg, ...)
{
  /* intentionally empty: every message is thrown away */
  (void)user_data;
  (void)msg;
}
|
|
|
|
|
|
static int
|
|
raptor_grddl_parse_chunk(raptor_parser* rdf_parser,
|
|
const unsigned char *s, size_t len,
|
|
int is_end)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
int i;
|
|
int ret=0;
|
|
const unsigned char* uri_string;
|
|
raptor_uri* uri;
|
|
/* XML document DOM */
|
|
xmlDocPtr doc;
|
|
int expri;
|
|
const unsigned char* buffer=NULL;
|
|
size_t buffer_len=0;
|
|
int buffer_is_libxml=0;
|
|
int loop;
|
|
raptor_error_handlers eh;
|
|
|
|
if(grddl_parser->content_type && !grddl_parser->content_type_check) {
|
|
grddl_parser->content_type_check++;
|
|
if(!strncmp(grddl_parser->content_type, "application/rdf+xml", 19)) {
|
|
RAPTOR_DEBUG3("Parser %p: Found document with type '%s' is RDF/XML\n",
|
|
rdf_parser, grddl_parser->content_type);
|
|
grddl_parser->process_this_as_rdfxml=1;
|
|
}
|
|
if(!strncmp(grddl_parser->content_type, "text/html", 9) ||
|
|
!strncmp(grddl_parser->content_type, "application/html+xml", 20)) {
|
|
RAPTOR_DEBUG3("Parser %p: Found document with type '%s' is HTML\n",
|
|
rdf_parser, grddl_parser->content_type);
|
|
grddl_parser->html_base_processing=1;
|
|
}
|
|
}
|
|
|
|
if(!grddl_parser->sb)
|
|
grddl_parser->sb=raptor_new_stringbuffer();
|
|
|
|
raptor_stringbuffer_append_counted_string(grddl_parser->sb, s, len, 1);
|
|
|
|
if(!is_end)
|
|
return 0;
|
|
|
|
buffer_len=raptor_stringbuffer_length(grddl_parser->sb);
|
|
buffer=(const unsigned char*)RAPTOR_MALLOC(cstring, buffer_len+1);
|
|
if(buffer)
|
|
raptor_stringbuffer_copy_to_string(grddl_parser->sb,
|
|
(unsigned char*)buffer, buffer_len);
|
|
|
|
|
|
uri_string=raptor_uri_as_string(rdf_parser->base_uri);
|
|
|
|
if(1) {
|
|
raptor_error_handlers_init(&eh);
|
|
eh.last_log_level=rdf_parser->error_handlers.last_log_level;
|
|
|
|
/* Save error handlers and discard parsing errors
|
|
* NOTE not setting RAPTOR_LOG_LEVEL_NONE handler
|
|
*/
|
|
memcpy(&eh.handlers, &rdf_parser->error_handlers,
|
|
sizeof(raptor_message_handler_closure) * (1+eh.last_log_level));
|
|
for(i=1; i <= (int)eh.last_log_level; i++) {
|
|
rdf_parser->error_handlers.handlers[i].handler=raptor_grddl_discard_message;
|
|
rdf_parser->error_handlers.handlers[i].user_data=NULL;
|
|
}
|
|
}
|
|
|
|
RAPTOR_DEBUG4("Parser %p: URI %s: processing %d bytes of content\n",
|
|
rdf_parser, uri_string, (int)buffer_len);
|
|
|
|
for(loop=0; loop<2; loop++) {
|
|
int rc;
|
|
|
|
if(loop == 0) {
|
|
int libxml_options = 0;
|
|
|
|
RAPTOR_DEBUG2("Parser %p: Creating an XML parser\n", rdf_parser);
|
|
|
|
/* try to create an XML parser context */
|
|
grddl_parser->xml_ctxt = xmlCreatePushParserCtxt(NULL, NULL,
|
|
(const char*)buffer,
|
|
buffer_len,
|
|
(const char*)uri_string);
|
|
if(!grddl_parser->xml_ctxt) {
|
|
RAPTOR_DEBUG2("Parser %p: Creating an XML parser failed\n", rdf_parser);
|
|
continue;
|
|
}
|
|
|
|
#ifdef RAPTOR_LIBXML_XML_PARSE_NONET
|
|
if(rdf_parser->features[RAPTOR_FEATURE_NO_NET])
|
|
libxml_options |= XML_PARSE_NONET;
|
|
#endif
|
|
#ifdef HAVE_XMLCTXTUSEOPTIONS
|
|
xmlCtxtUseOptions(grddl_parser->xml_ctxt, libxml_options);
|
|
#endif
|
|
|
|
|
|
grddl_parser->xml_ctxt->vctxt.warning = raptor_grddl_libxml_discard_error;
|
|
grddl_parser->xml_ctxt->vctxt.error = raptor_grddl_libxml_discard_error;
|
|
|
|
grddl_parser->xml_ctxt->replaceEntities = 1;
|
|
grddl_parser->xml_ctxt->loadsubset = 1;
|
|
} else if (loop == 1) {
|
|
|
|
/* try to create an HTML parser context */
|
|
if(rdf_parser->features[RAPTOR_FEATURE_HTML_TAG_SOUP]) {
|
|
xmlCharEncoding enc;
|
|
int options;
|
|
|
|
RAPTOR_DEBUG2("Parser %p: Creating an HTML parser\n", rdf_parser);
|
|
|
|
enc = xmlDetectCharEncoding((const unsigned char*)buffer, buffer_len);
|
|
grddl_parser->html_ctxt = htmlCreatePushParserCtxt(/*sax*/ NULL,
|
|
/*user_data*/ NULL,
|
|
(const char *)buffer,
|
|
buffer_len,
|
|
(const char *)uri_string,
|
|
enc);
|
|
if(!grddl_parser->html_ctxt) {
|
|
RAPTOR_DEBUG2("Parser %p: Creating an HTML parser failed\n", rdf_parser);
|
|
continue;
|
|
}
|
|
|
|
/* HTML parser */
|
|
grddl_parser->html_ctxt->replaceEntities = 1;
|
|
grddl_parser->html_ctxt->loadsubset = 1;
|
|
|
|
grddl_parser->html_ctxt->vctxt.error = raptor_grddl_libxml_discard_error;
|
|
|
|
/* HTML_PARSE_NOWARNING disables sax->warning, vxtxt.warning */
|
|
/* HTML_PARSE_NOERROR disables sax->error, vctxt.error */
|
|
options = HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING;
|
|
#ifdef HTML_PARSE_RECOVER
|
|
options |= HTML_PARSE_RECOVER;
|
|
#endif
|
|
#ifdef RAPTOR_LIBXML_HTML_PARSE_NONET
|
|
if(rdf_parser->features[RAPTOR_FEATURE_NO_NET])
|
|
options |= HTML_PARSE_NONET;
|
|
#endif
|
|
|
|
htmlCtxtUseOptions(grddl_parser->html_ctxt, options);
|
|
|
|
} else
|
|
continue;
|
|
} else
|
|
continue;
|
|
|
|
xmlSetStructuredErrorFunc(&rdf_parser->error_handlers,
|
|
raptor_libxml_xmlStructuredErrorFunc);
|
|
|
|
rc=0;
|
|
|
|
if(grddl_parser->html_ctxt) {
|
|
RAPTOR_DEBUG2("Parser %p: Parsing as HTML\n", rdf_parser);
|
|
rc=htmlParseChunk(grddl_parser->html_ctxt, (const char*)s, 0, 1);
|
|
RAPTOR_DEBUG3("Parser %p: Parsing as HTML %s\n", rdf_parser, (rc ? "failed" : "succeeded"));
|
|
if(rc) {
|
|
if(grddl_parser->html_ctxt->myDoc) {
|
|
xmlFreeDoc(grddl_parser->html_ctxt->myDoc);
|
|
grddl_parser->html_ctxt->myDoc=NULL;
|
|
}
|
|
htmlFreeParserCtxt(grddl_parser->html_ctxt);
|
|
grddl_parser->html_ctxt=NULL;
|
|
}
|
|
} else {
|
|
RAPTOR_DEBUG2("Parser %p: Parsing as XML\n", rdf_parser);
|
|
rc=xmlParseChunk(grddl_parser->xml_ctxt, (const char*)s, 0, 1);
|
|
RAPTOR_DEBUG3("Parser %p: Parsing as XML %s\n", rdf_parser, (rc ? "failed" : "succeeded"));
|
|
if(rc) {
|
|
if(grddl_parser->xml_ctxt->myDoc) {
|
|
xmlFreeDoc(grddl_parser->xml_ctxt->myDoc);
|
|
grddl_parser->xml_ctxt->myDoc=NULL;
|
|
}
|
|
xmlFreeParserCtxt(grddl_parser->xml_ctxt);
|
|
grddl_parser->xml_ctxt=NULL;
|
|
}
|
|
}
|
|
|
|
if(!rc)
|
|
break;
|
|
|
|
}
|
|
|
|
if(1) {
|
|
/* Restore error handlers */
|
|
for(i=1; i<= (int)eh.last_log_level; i++)
|
|
rdf_parser->error_handlers.handlers[i].handler=eh.handlers[i].handler;
|
|
rdf_parser->error_handlers.handlers[i].user_data=eh.handlers[i].user_data;
|
|
}
|
|
|
|
if(!grddl_parser->html_ctxt && !grddl_parser->xml_ctxt) {
|
|
raptor_parser_error(rdf_parser, "Failed to create HTML or XML parsers");
|
|
ret=1;
|
|
goto tidy;
|
|
}
|
|
|
|
raptor_grddl_done_uri(grddl_parser, rdf_parser->base_uri);
|
|
|
|
if(grddl_parser->html_ctxt)
|
|
doc=grddl_parser->html_ctxt->myDoc;
|
|
else
|
|
doc=grddl_parser->xml_ctxt->myDoc;
|
|
if(!doc) {
|
|
raptor_parser_error(rdf_parser,
|
|
"Failed to create XML DOM for GRDDL document");
|
|
ret=1;
|
|
goto tidy;
|
|
}
|
|
|
|
if(!grddl_parser->grddl_processing)
|
|
goto transform;
|
|
|
|
|
|
if(grddl_parser->xinclude_processing) {
|
|
RAPTOR_DEBUG3("Parser %p: Running XInclude processing on URI '%s'\n",
|
|
rdf_parser, raptor_uri_as_string(rdf_parser->base_uri));
|
|
if(xmlXIncludeProcess(doc) < 0) {
|
|
raptor_parser_error(rdf_parser,
|
|
"XInclude processing failed for GRDDL document");
|
|
ret=1;
|
|
goto tidy;
|
|
} else {
|
|
int blen;
|
|
|
|
/* write the result of XML Include to buffer */
|
|
RAPTOR_FREE(cstring, buffer);
|
|
xmlDocDumpFormatMemory(doc, (xmlChar**)&buffer, &blen,
|
|
1 /* indent the result */);
|
|
buffer_len=blen;
|
|
buffer_is_libxml=1;
|
|
|
|
RAPTOR_DEBUG3("Parser %p: XML Include processing returned %d bytes document\n",
|
|
rdf_parser, (int)buffer_len);
|
|
}
|
|
}
|
|
|
|
|
|
RAPTOR_DEBUG3("Parser %p: Running top-level GRDDL on URI '%s'\n",
|
|
rdf_parser, raptor_uri_as_string(rdf_parser->base_uri));
|
|
|
|
/* Work out if there is a root namespace URI */
|
|
if(1) {
|
|
xmlNodePtr xnp;
|
|
xmlNsPtr rootNs = NULL;
|
|
const unsigned char* ns_uri_string=NULL;
|
|
|
|
xnp = xmlDocGetRootElement(doc);
|
|
if(xnp) {
|
|
rootNs = xnp->ns;
|
|
if(rootNs)
|
|
ns_uri_string = (const unsigned char*)(rootNs->href);
|
|
}
|
|
|
|
if(ns_uri_string) {
|
|
int n;
|
|
|
|
RAPTOR_DEBUG3("Parser %p: Root namespace URI is %s\n",
|
|
rdf_parser, ns_uri_string);
|
|
|
|
if(!strcmp((const char*)ns_uri_string,
|
|
(const char*)raptor_rdf_namespace_uri) &&
|
|
!strcmp((const char*)xnp->name, "RDF")) {
|
|
RAPTOR_DEBUG3("Parser %p: Root element of %s is rdf:RDF - process this as RDF/XML later\n",
|
|
rdf_parser, raptor_uri_as_string(rdf_parser->base_uri));
|
|
grddl_parser->process_this_as_rdfxml=1;
|
|
}
|
|
|
|
for(n=0; grddl_namespace_uris_ignore_list[n]; n++) {
|
|
if(!strcmp(grddl_namespace_uris_ignore_list[n],
|
|
(const char*)ns_uri_string)) {
|
|
/* ignore this namespace */
|
|
RAPTOR_DEBUG3("Parser %p: Ignoring GRDDL for namespace URI '%s'\n",
|
|
rdf_parser, ns_uri_string);
|
|
ns_uri_string=NULL;
|
|
break;
|
|
}
|
|
}
|
|
if(ns_uri_string) {
|
|
grddl_xml_context* xml_context;
|
|
|
|
grddl_parser->root_ns_uri=raptor_new_uri_relative_to_base(rdf_parser->base_uri,
|
|
ns_uri_string);
|
|
xml_context=raptor_new_xml_context(grddl_parser->root_ns_uri,
|
|
rdf_parser->base_uri);
|
|
raptor_sequence_push(grddl_parser->profile_uris, xml_context);
|
|
|
|
RAPTOR_DEBUG3("Parser %p: Processing GRDDL namespace URI '%s'\n",
|
|
rdf_parser,
|
|
raptor_uri_as_string(grddl_parser->root_ns_uri));
|
|
raptor_grddl_run_recursive(rdf_parser, grddl_parser->root_ns_uri,
|
|
"grddl",
|
|
RECURSIVE_FLAGS_IGNORE_ERRORS |
|
|
RECURSIVE_FLAGS_FILTER);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
/* Always put something at the start of the list even if NULL
|
|
* so later it can be searched for in output triples
|
|
*/
|
|
if(!grddl_parser->root_ns_uri) {
|
|
grddl_xml_context* xml_context;
|
|
xml_context=raptor_new_xml_context(NULL, NULL);
|
|
raptor_sequence_push(grddl_parser->profile_uris, xml_context);
|
|
}
|
|
|
|
|
|
/* Create the XPath evaluation context */
|
|
if(!grddl_parser->xpathCtx) {
|
|
grddl_parser->xpathCtx = xmlXPathNewContext(doc);
|
|
if(!grddl_parser->xpathCtx) {
|
|
raptor_parser_error(rdf_parser,
|
|
"Failed to create XPath context for GRDDL document");
|
|
ret=1;
|
|
goto tidy;
|
|
}
|
|
|
|
xmlXPathRegisterNs(grddl_parser->xpathCtx,
|
|
(const xmlChar*)"html",
|
|
(const xmlChar*)"http://www.w3.org/1999/xhtml");
|
|
xmlXPathRegisterNs(grddl_parser->xpathCtx,
|
|
(const xmlChar*)"dataview",
|
|
(const xmlChar*)"http://www.w3.org/2003/g/data-view#");
|
|
}
|
|
|
|
/* Try <head profile> URIs */
|
|
if(1) {
|
|
raptor_sequence* result;
|
|
result=raptor_grddl_run_xpath_match(rdf_parser, doc,
|
|
(const xmlChar*)"/html:html/html:head/@profile",
|
|
MATCH_IS_VALUE_LIST | MATCH_IS_PROFILE);
|
|
if(result) {
|
|
RAPTOR_DEBUG4("Parser %p: Found %d <head profile> URIs in URI '%s'\n",
|
|
rdf_parser, raptor_sequence_size(result),
|
|
raptor_uri_as_string(rdf_parser->base_uri));
|
|
|
|
|
|
/* Store profile URIs, skipping NULLs or the GRDDL profile itself */
|
|
while(raptor_sequence_size(result)) {
|
|
grddl_xml_context* xml_context;
|
|
|
|
xml_context=(grddl_xml_context*)raptor_sequence_unshift(result);
|
|
if(!xml_context)
|
|
continue;
|
|
uri=xml_context->uri;
|
|
if(!strcmp("http://www.w3.org/2003/g/data-view",
|
|
(const char*)raptor_uri_as_string(uri))) {
|
|
RAPTOR_DEBUG3("Ignoring <head profile> of URI %s: URI %s\n",
|
|
raptor_uri_as_string(rdf_parser->base_uri),
|
|
raptor_uri_as_string(uri));
|
|
grddl_free_xml_context(xml_context);
|
|
continue;
|
|
}
|
|
raptor_sequence_push(grddl_parser->profile_uris, xml_context);
|
|
}
|
|
raptor_free_sequence(result);
|
|
|
|
|
|
/* Recursive GRDDL through all the <head profile> URIs */
|
|
for(i=1; i < raptor_sequence_size(grddl_parser->profile_uris); i++) {
|
|
grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_get_at(grddl_parser->profile_uris, i);
|
|
uri=xml_context->uri;
|
|
if(!uri)
|
|
continue;
|
|
|
|
RAPTOR_DEBUG4("Processing <head profile> #%d of URI %s: URI %s\n",
|
|
i, raptor_uri_as_string(rdf_parser->base_uri),
|
|
raptor_uri_as_string(uri));
|
|
ret=raptor_grddl_run_recursive(rdf_parser, uri,
|
|
"grddl",
|
|
RECURSIVE_FLAGS_IGNORE_ERRORS|
|
|
RECURSIVE_FLAGS_FILTER);
|
|
}
|
|
}
|
|
|
|
} /* end head profile URIs */
|
|
|
|
|
|
/* Try XHTML document with alternate forms
|
|
* <link type="application/rdf+xml" href="URI" />
|
|
* Value of @href is a URI
|
|
*/
|
|
if(grddl_parser->html_link_processing &&
|
|
rdf_parser->features[RAPTOR_FEATURE_HTML_LINK]) {
|
|
raptor_sequence* result;
|
|
result=raptor_grddl_run_xpath_match(rdf_parser, doc,
|
|
(const xmlChar*)"/html:html/html:head/html:link[@type=\"application/rdf+xml\"]/@href",
|
|
0);
|
|
if(result) {
|
|
RAPTOR_DEBUG4("Parser %p: Found %d <link> URIs in URI '%s'\n",
|
|
rdf_parser, raptor_sequence_size(result),
|
|
raptor_uri_as_string(rdf_parser->base_uri));
|
|
|
|
/* Recursively parse all the <link> URIs, skipping NULLs */
|
|
i=0;
|
|
while(raptor_sequence_size(result)) {
|
|
grddl_xml_context* xml_context;
|
|
|
|
xml_context=(grddl_xml_context*)raptor_sequence_unshift(result);
|
|
if(!xml_context)
|
|
continue;
|
|
|
|
uri=xml_context->uri;
|
|
if(uri) {
|
|
RAPTOR_DEBUG4("Processing <link> #%d of URI %s: URI %s\n",
|
|
i, raptor_uri_as_string(rdf_parser->base_uri),
|
|
raptor_uri_as_string(uri));
|
|
i++;
|
|
ret=raptor_grddl_run_recursive(rdf_parser, uri, "guess",
|
|
RECURSIVE_FLAGS_IGNORE_ERRORS);
|
|
}
|
|
grddl_free_xml_context(xml_context);
|
|
}
|
|
|
|
raptor_free_sequence(result);
|
|
}
|
|
}
|
|
|
|
|
|
/* Try all XPaths */
|
|
for(expri=0; match_table[expri].xpath; expri++) {
|
|
raptor_sequence* result;
|
|
int flags=match_table[expri].flags;
|
|
|
|
if((flags & MATCH_IS_HARDCODED) &&
|
|
!rdf_parser->features[RAPTOR_FEATURE_MICROFORMATS])
|
|
continue;
|
|
|
|
result=raptor_grddl_run_xpath_match(rdf_parser, doc,
|
|
match_table[expri].xpath, flags);
|
|
if(result) {
|
|
if(match_table[expri].xslt_sheet_uri) {
|
|
grddl_xml_context* xml_context;
|
|
|
|
/* Ignore what matched, use a hardcoded XSLT URI */
|
|
uri_string=match_table[expri].xslt_sheet_uri;
|
|
RAPTOR_DEBUG3("Parser %p: Using hard-coded XSLT URI '%s'\n",
|
|
rdf_parser, uri_string);
|
|
|
|
raptor_free_sequence(result);
|
|
result=raptor_new_sequence((raptor_sequence_free_handler*)grddl_free_xml_context, NULL);
|
|
|
|
uri=raptor_new_uri_relative_to_base(rdf_parser->base_uri, uri_string);
|
|
|
|
xml_context=raptor_new_xml_context(uri, rdf_parser->base_uri);
|
|
raptor_sequence_push(result, xml_context);
|
|
|
|
raptor_free_uri(uri);
|
|
}
|
|
|
|
while(raptor_sequence_size(result)) {
|
|
grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_unshift(result);
|
|
if(!xml_context)
|
|
break;
|
|
|
|
raptor_grddl_add_transform_xml_context(grddl_parser, xml_context);
|
|
}
|
|
raptor_free_sequence(result);
|
|
|
|
if(flags & MATCH_LAST)
|
|
break;
|
|
}
|
|
|
|
|
|
if(rdf_parser->failed)
|
|
break;
|
|
|
|
} /* end XPath expression loop */
|
|
|
|
if(rdf_parser->failed) {
|
|
ret=1;
|
|
goto tidy;
|
|
}
|
|
|
|
|
|
/* Process this document's content buffer as RDF/XML */
|
|
if(grddl_parser->process_this_as_rdfxml && buffer) {
|
|
RAPTOR_DEBUG3("Parser %p: Running additional RDF/XML parse on root document URI '%s' content\n",
|
|
rdf_parser, raptor_uri_as_string(rdf_parser->base_uri));
|
|
|
|
if(raptor_grddl_ensure_internal_parser(rdf_parser, "rdfxml", 0))
|
|
ret=1;
|
|
else {
|
|
grddl_parser->internal_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(rdf_parser);
|
|
|
|
if(raptor_start_parse(grddl_parser->internal_parser,
|
|
rdf_parser->base_uri))
|
|
ret=1;
|
|
else {
|
|
ret=raptor_parse_chunk(grddl_parser->internal_parser, buffer,
|
|
buffer_len, 1);
|
|
rdf_parser->default_generate_id_handler_base=
|
|
raptor_parser_get_current_base_id(grddl_parser->internal_parser);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/* Apply all transformation URIs seen */
|
|
transform:
|
|
while(raptor_sequence_size(grddl_parser->doc_transform_uris)) {
|
|
grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_unshift(grddl_parser->doc_transform_uris);
|
|
ret=raptor_grddl_run_grddl_transform_uri(rdf_parser, xml_context, doc);
|
|
grddl_free_xml_context(xml_context);
|
|
if(ret)
|
|
break;
|
|
}
|
|
|
|
|
|
tidy:
|
|
if(buffer) {
|
|
if(buffer_is_libxml)
|
|
xmlFree((xmlChar*)buffer);
|
|
else
|
|
RAPTOR_FREE(cstring, buffer);
|
|
}
|
|
|
|
if(grddl_parser->sb) {
|
|
raptor_free_stringbuffer(grddl_parser->sb);
|
|
grddl_parser->sb=NULL;
|
|
}
|
|
|
|
if(grddl_parser->xml_ctxt) {
|
|
if(grddl_parser->xml_ctxt->myDoc) {
|
|
xmlFreeDoc(grddl_parser->xml_ctxt->myDoc);
|
|
grddl_parser->xml_ctxt->myDoc=NULL;
|
|
}
|
|
xmlFreeParserCtxt(grddl_parser->xml_ctxt);
|
|
grddl_parser->xml_ctxt=NULL;
|
|
}
|
|
if(grddl_parser->html_ctxt) {
|
|
if(grddl_parser->html_ctxt->myDoc) {
|
|
xmlFreeDoc(grddl_parser->html_ctxt->myDoc);
|
|
grddl_parser->html_ctxt->myDoc=NULL;
|
|
}
|
|
xmlFreeParserCtxt(grddl_parser->html_ctxt);
|
|
grddl_parser->html_ctxt=NULL;
|
|
}
|
|
|
|
if(grddl_parser->xpathCtx) {
|
|
xmlXPathFreeContext(grddl_parser->xpathCtx);
|
|
grddl_parser->xpathCtx=NULL;
|
|
}
|
|
|
|
return (ret != 0);
|
|
}
|
|
|
|
|
|
static int
|
|
raptor_grddl_parse_recognise_syntax(raptor_parser_factory* factory,
|
|
const unsigned char *buffer, size_t len,
|
|
const unsigned char *identifier,
|
|
const unsigned char *suffix,
|
|
const char *mime_type)
|
|
{
|
|
int score= 0;
|
|
|
|
if(suffix) {
|
|
if(!strcmp((const char*)suffix, "xhtml"))
|
|
score=7;
|
|
if(!strcmp((const char*)suffix, "html"))
|
|
score=2;
|
|
}
|
|
|
|
if(identifier) {
|
|
if(strstr((const char*)identifier, "xhtml"))
|
|
score+=5;
|
|
}
|
|
|
|
return score;
|
|
}
|
|
|
|
|
|
static void
|
|
raptor_grddl_parse_content_type_handler(raptor_parser* rdf_parser,
|
|
const char* content_type)
|
|
{
|
|
raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context;
|
|
|
|
if(content_type) {
|
|
size_t len=strlen(content_type)+1;
|
|
if(grddl_parser->content_type)
|
|
RAPTOR_FREE(cstring,grddl_parser->content_type);
|
|
|
|
grddl_parser->content_type=(char*)RAPTOR_MALLOC(cstring, len+1);
|
|
strncpy(grddl_parser->content_type, content_type, len+1);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
raptor_grddl_parser_register_factory(raptor_parser_factory *factory)
|
|
{
|
|
int rc=0;
|
|
|
|
factory->context_length = sizeof(raptor_grddl_parser_context);
|
|
|
|
factory->need_base_uri = 1;
|
|
|
|
factory->init = raptor_grddl_parse_init;
|
|
factory->terminate = raptor_grddl_parse_terminate;
|
|
factory->start = raptor_grddl_parse_start;
|
|
factory->chunk = raptor_grddl_parse_chunk;
|
|
factory->recognise_syntax = raptor_grddl_parse_recognise_syntax;
|
|
factory->content_type_handler= raptor_grddl_parse_content_type_handler;
|
|
|
|
rc+= raptor_parser_factory_add_mime_type(factory, "text/html", 2) != 0;
|
|
rc+= raptor_parser_factory_add_mime_type(factory, "application/xhtml+xml", 4) != 0;
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
static xsltSecurityPrefsPtr raptor_xslt_sec = NULL;
|
|
|
|
int
|
|
raptor_init_parser_grddl_common(void)
|
|
{
|
|
#ifdef HAVE_XSLTINIT
|
|
xsltInit();
|
|
#endif
|
|
|
|
raptor_xslt_sec = xsltNewSecurityPrefs();
|
|
xsltSetDefaultSecurityPrefs(raptor_xslt_sec);
|
|
|
|
|
|
/* no read from file (read from URI with scheme = file) */
|
|
xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_READ_FILE,
|
|
xsltSecurityForbid);
|
|
|
|
/* no create/write to file */
|
|
xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_WRITE_FILE,
|
|
xsltSecurityForbid);
|
|
|
|
/* no create directory */
|
|
xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_CREATE_DIRECTORY,
|
|
xsltSecurityForbid);
|
|
|
|
/* yes read from URI with scheme != file (XSLT_SECPREF_READ_NETWORK) */
|
|
|
|
/* no write to network (you can 'write' with GET params anyway) */
|
|
xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_WRITE_NETWORK,
|
|
xsltSecurityForbid);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
raptor_init_parser_grddl(void)
|
|
{
|
|
return !raptor_parser_register_factory("grddl",
|
|
"Gleaning Resource Descriptions from Dialects of Languages",
|
|
&raptor_grddl_parser_register_factory);
|
|
}
|
|
|
|
|
|
void
|
|
raptor_terminate_parser_grddl_common(void)
|
|
{
|
|
xsltCleanupGlobals();
|
|
|
|
if(raptor_xslt_sec)
|
|
xsltFreeSecurityPrefs(raptor_xslt_sec);
|
|
}
|
|
|