1
0
mirror of https://github.com/cookiengineer/audacity synced 2025-06-27 01:28:38 +02:00
audacity/lib-src/libraptor/src/raptor_rss.c
2010-01-24 09:19:39 +00:00

1325 lines
45 KiB
C

/* -*- Mode: c; c-basic-offset: 2 -*-
*
* raptor_rss.c - Raptor RSS tag soup parser
*
* Copyright (C) 2003-2007, David Beckett http://purl.org/net/dajobe/
* Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/
*
* Contributions:
* Copyright (C) 2004-2005, Suzan Foster <su@islief.nl>
*
* This package is Free Software and part of Redland http://librdf.org/
*
* It is licensed under the following three licenses as alternatives:
* 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
* 2. GNU General Public License (GPL) V2 or any newer version
* 3. Apache License, V2.0 or any newer version
*
* You may not use this file except in compliance with at least one of
* the above three licenses.
*
* See LICENSE.html or LICENSE.txt at the top of this package for the
* complete terms and further detail along with the license texts for
* the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
*
*
*/
#ifdef HAVE_CONFIG_H
#include <raptor_config.h>
#endif
#ifdef WIN32
#include <win32_raptor_config.h>
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
/* Raptor includes */
#include "raptor.h"
#include "raptor_internal.h"
#include "raptor_rss.h"
/* local prototypes */
static void raptor_rss_insert_identifiers(raptor_parser* rdf_parser);
static void raptor_rss_uplift_items(raptor_parser* rdf_parser);
static int raptor_rss_emit(raptor_parser* rdf_parser);
static void raptor_rss_start_element_handler(void *user_data, raptor_xml_element* xml_element);
static void raptor_rss_end_element_handler(void *user_data, raptor_xml_element* xml_element);
static void raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s, int len);
static void raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s);
/*
* RSS parser object
*/
struct raptor_rss_parser_s {
/* static model */
raptor_rss_model model;
/* current line */
char *line;
/* current line length */
int line_length;
/* current char in line buffer */
int offset;
/* static statement for use in passing to user code */
raptor_statement statement;
raptor_sax2 *sax2;
/* rss node type of current item */
raptor_rss_type current_type;
/* one place stack */
raptor_rss_type prev_type;
raptor_rss_fields_type current_field;
/* emptyness of current element */
int element_is_empty;
/* stack of namespaces */
raptor_namespace_stack *nstack;
/* non-0 if this is an atom 1.0 parser */
int is_atom;
/* namespaces declared here */
raptor_namespace* nspaces[RAPTOR_RSS_NAMESPACES_SIZE];
};
typedef struct raptor_rss_parser_s raptor_rss_parser;
typedef enum {
RAPTOR_RSS_CONTENT_TYPE_NONE,
RAPTOR_RSS_CONTENT_TYPE_XML,
RAPTOR_RSS_CONTENT_TYPE_TEXT
} raptor_rss_content_type;
struct raptor_rss_element_s
{
raptor_uri* uri;
const unsigned char *rel;
/* Two types of content */
raptor_rss_content_type type;
/* 1) XML */
raptor_xml_writer* xml_writer;
/* XML written to this iostream to the xml_content string */
raptor_iostream* iostream;
/* ends up here */
void *xml_content;
size_t xml_content_length;
/* 2) cdata */
raptor_stringbuffer* sb;
};
typedef struct raptor_rss_element_s raptor_rss_element;
static void
raptor_free_rss_element(raptor_rss_element *rss_element)
{
if(rss_element->uri)
raptor_free_uri(rss_element->uri);
if(rss_element->rel)
raptor_free_memory((void*)rss_element->rel);
if(rss_element->type == RAPTOR_RSS_CONTENT_TYPE_XML) {
if(rss_element->xml_writer)
raptor_free_xml_writer(rss_element->xml_writer);
if(rss_element->iostream)
raptor_free_iostream(rss_element->iostream);
if(rss_element->xml_content)
raptor_free_memory(rss_element->xml_content);
}
if(rss_element->sb)
raptor_free_stringbuffer(rss_element->sb);
RAPTOR_FREE(raptor_rss_element, rss_element);
}
static int
raptor_rss_parse_init(raptor_parser* rdf_parser, const char *name)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
raptor_sax2* sax2;
const raptor_uri_handler *uri_handler;
void *uri_context;
int n;
raptor_rss_common_init();
raptor_rss_model_init(&rss_parser->model);
rss_parser->prev_type=RAPTOR_RSS_NONE;
rss_parser->current_field=RAPTOR_RSS_FIELD_NONE;
rss_parser->current_type=RAPTOR_RSS_NONE;
if(rss_parser->sax2) {
raptor_free_sax2(rss_parser->sax2);
rss_parser->sax2=NULL;
}
raptor_uri_get_handler(&uri_handler, &uri_context);
rss_parser->nstack=raptor_new_namespaces(uri_handler, uri_context,
NULL, NULL, /* errors */
1);
/* Initialise the namespaces */
for(n=0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) {
unsigned const char* prefix=(unsigned const char*)raptor_rss_namespaces_info[n].prefix;
raptor_uri* uri=raptor_rss_namespaces_info[n].uri;
raptor_namespace* nspace=NULL;
if(prefix && uri)
nspace=raptor_new_namespace_from_uri(rss_parser->nstack,
prefix, uri, 0);
rss_parser->nspaces[n]=nspace;
}
sax2=raptor_new_sax2(rdf_parser, &rdf_parser->error_handlers);
rss_parser->sax2=sax2;
raptor_sax2_set_start_element_handler(sax2, raptor_rss_start_element_handler);
raptor_sax2_set_end_element_handler(sax2, raptor_rss_end_element_handler);
raptor_sax2_set_characters_handler(sax2, raptor_rss_cdata_handler);
raptor_sax2_set_cdata_handler(sax2, raptor_rss_cdata_handler);
raptor_sax2_set_comment_handler(sax2, raptor_rss_comment_handler);
return 0;
}
static void
raptor_rss_parse_terminate(raptor_parser *rdf_parser)
{
raptor_rss_parser *rss_parser=(raptor_rss_parser*)rdf_parser->context;
int n;
if(rss_parser->sax2)
raptor_free_sax2(rss_parser->sax2);
raptor_rss_model_clear(&rss_parser->model);
/* Initialise the namespaces */
for(n=0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) {
if(rss_parser->nspaces[n])
raptor_free_namespace(rss_parser->nspaces[n]);
}
if(rss_parser->nstack)
raptor_free_namespaces(rss_parser->nstack);
raptor_rss_common_terminate();
}
static int
raptor_rss_parse_start(raptor_parser *rdf_parser)
{
raptor_uri *uri=rdf_parser->base_uri;
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
/* base URI required for RSS */
if(!uri)
return 1;
/* Optionally forbid network requests in the XML parser */
raptor_sax2_set_feature(rss_parser->sax2,
RAPTOR_FEATURE_NO_NET,
rdf_parser->features[RAPTOR_FEATURE_NO_NET]);
raptor_sax2_parse_start(rss_parser->sax2, uri);
return 0;
}
static void
raptor_rss_start_element_handler(void *user_data,
raptor_xml_element* xml_element)
{
raptor_parser *rdf_parser;
raptor_rss_parser *rss_parser;
raptor_rss_enclosure *enclosure=NULL;
raptor_uri* base_uri;
raptor_qname *el_qname;
const unsigned char *name;
int ns_attributes_count;
raptor_qname** named_attrs;
const raptor_namespace* el_nspace;
raptor_rss_element* rss_element;
rss_element=(raptor_rss_element*)RAPTOR_CALLOC(raptor_rss_element, sizeof(raptor_rss_element), 1);
rss_element->sb=raptor_new_stringbuffer();
xml_element->user_data=rss_element;
if(xml_element->parent) {
raptor_rss_element* parent_rss_element=(raptor_rss_element*)(xml_element->parent->user_data);
if(parent_rss_element->xml_writer)
rss_element->xml_writer=parent_rss_element->xml_writer;
}
if(rss_element->xml_writer) {
raptor_xml_writer_start_element(rss_element->xml_writer, xml_element);
return;
}
el_qname=raptor_xml_element_get_name(xml_element);
name=el_qname->local_name;
el_nspace=el_qname->nspace;
rdf_parser=(raptor_parser*)user_data;
rss_parser=(raptor_rss_parser*)rdf_parser->context;
base_uri=raptor_sax2_inscope_base_uri(rss_parser->sax2);
if(rss_parser->current_type == RAPTOR_RSS_NONE) {
if(!strcmp((const char*)name, "rss") ||
!strcmp((const char*)name, "rdf") ||
!strcmp((const char*)name, "RDF")) {
/* rss */
goto check_attributes;
} else if(!raptor_strcasecmp((const char*)name, "channel")) {
/* rss or atom 0.3 channel */
rss_parser->current_type=RAPTOR_RSS_CHANNEL;
} else if(!strcmp((const char*)name, "feed")) {
/* atom 1.0 feed */
rss_parser->current_type=RAPTOR_RSS_CHANNEL;
rss_parser->is_atom=1;
} else if(!strcmp((const char*)name, "item")) {
raptor_rss_model_add_item(&rss_parser->model);
rss_parser->current_type=RAPTOR_RSS_ITEM;
} else if(!strcmp((const char*)name, "entry")) {
raptor_rss_model_add_item(&rss_parser->model);
rss_parser->current_type=RAPTOR_RSS_ITEM;
rss_parser->is_atom=1;
} else {
int i;
rss_parser->current_type=RAPTOR_RSS_UNKNOWN;
for(i=0; i<RAPTOR_RSS_COMMON_SIZE; i++)
if(!strcmp((const char*)name, raptor_rss_types_info[i].name)) {
rss_parser->current_type=(raptor_rss_type)i;
break;
}
}
if(rss_parser->current_type == RAPTOR_RSS_UNKNOWN) {
RAPTOR_DEBUG2("Unknown start element named %s\n", name);
} else {
RAPTOR_DEBUG3("FOUND type %d - %s\n", rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name);
if (rss_parser->current_type != RAPTOR_RSS_ITEM)
raptor_rss_model_add_common(&rss_parser->model,
rss_parser->current_type);
}
} else { /* have current_type, this is an element inside */
int i;
raptor_rss_type old_type=rss_parser->current_type;
/* check it is not a type here */
if(!strcmp((const char*)name, "item") ||
!strcmp((const char*)name, "entry")) {
raptor_rss_model_add_item(&rss_parser->model);
rss_parser->current_type=RAPTOR_RSS_ITEM;
} else {
for(i=0; i<RAPTOR_RSS_COMMON_SIZE; i++)
if(!strcmp((const char*)name, raptor_rss_types_info[i].name)) {
/* rss and atom clash on the author name field (rss) or type (atom) */
if(i != RAPTOR_ATOM_AUTHOR ||
(i == RAPTOR_ATOM_AUTHOR && rss_parser->is_atom)) {
rss_parser->current_type=(raptor_rss_type)i;
break;
}
}
}
if(rss_parser->current_type != old_type) {
RAPTOR_DEBUG6("FOUND element %s for type %d - %s INSIDE current type %d - %s\n", name, rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name, old_type, raptor_rss_types_info[old_type].name);
raptor_rss_model_add_common(&rss_parser->model,
rss_parser->current_type);
rss_parser->prev_type=old_type;
goto check_attributes;
}
rss_parser->current_field=RAPTOR_RSS_FIELD_UNKNOWN;
for(i=0; i<RAPTOR_RSS_FIELDS_SIZE; i++)
if(!strcmp((const char*)name, raptor_rss_fields_info[i].name)) {
raptor_uri* nspace_URI=el_nspace ? raptor_namespace_get_uri(el_nspace) : NULL;
/* RSS 0.9 and RSS 1.1 namespaces => RSS 1.0 namespace */
if(nspace_URI &&
(raptor_uri_equals(nspace_URI, raptor_rss_namespaces_info[RSS0_9_NS].uri) ||
raptor_uri_equals(nspace_URI, raptor_rss_namespaces_info[RSS1_1_NS].uri))) {
nspace_URI=raptor_rss_namespaces_info[RSS1_0_NS].uri;
}
/* Atom 0.3 namespace => Atom 1.0 namespace */
if(nspace_URI &&
raptor_uri_equals(nspace_URI, raptor_rss_namespaces_info[ATOM0_3_NS].uri)) {
nspace_URI=raptor_rss_namespaces_info[ATOM1_0_NS].uri;
}
if(nspace_URI && raptor_rss_fields_info[i].nspace != RSS_NO_NS) {
raptor_uri* field_nspace_URI=raptor_rss_namespaces_info[raptor_rss_fields_info[i].nspace].uri;
if(raptor_uri_equals(nspace_URI, field_nspace_URI)) {
rss_parser->current_field=(raptor_rss_fields_type)i;
break;
}
} else {
rss_parser->current_field=(raptor_rss_fields_type)i;
break;
}
}
if(rss_parser->current_field==RAPTOR_RSS_FIELD_UNKNOWN) {
RAPTOR_DEBUG3("Unknown field element named %s inside type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name);
} else if (rss_parser->current_field == RAPTOR_RSS_FIELD_ENCLOSURE ){
raptor_rss_item* update_item;
RAPTOR_DEBUG1("FOUND new enclosure\n");
if(rss_parser->current_type == RAPTOR_RSS_ITEM) {
update_item=rss_parser->model.last;
enclosure=raptor_rss_new_enclosure();
raptor_rss_item_add_enclosure(update_item, enclosure);
}
} else {
RAPTOR_DEBUG4("FOUND field %d - %s inside type %s\n", rss_parser->current_field, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
/* Rewrite item fields */
for(i=0; raptor_atom_to_rss[i].from != RAPTOR_RSS_FIELD_UNKNOWN; i++) {
if(raptor_atom_to_rss[i].from == rss_parser->current_field) {
rss_parser->current_field=raptor_atom_to_rss[i].to;
RAPTOR_DEBUG3("Rewrote into field %d - %s\n", rss_parser->current_field, raptor_rss_fields_info[rss_parser->current_field].name);
break;
}
}
}
}
check_attributes:
named_attrs=raptor_xml_element_get_attributes(xml_element);
ns_attributes_count=raptor_xml_element_get_attributes_count(xml_element);
/* Now check for attributes */
if(named_attrs && ns_attributes_count) {
int i;
for (i = 0; i < ns_attributes_count; i++) {
raptor_qname* attr=named_attrs[i];
const unsigned char* attrName = attr->local_name;
const unsigned char* attrValue = attr->value;
RAPTOR_DEBUG3(" attribute %s=%s\n", attrName, attrValue);
/* Pick a few attributes to care about */
if(!strcmp((const char*)attrName, "isPermaLink")) {
raptor_rss_item* update_item=rss_parser->model.last;
if(!strcmp((const char*)name, "guid")) {
/* <guid isPermaLink="..."> */
if(update_item) {
raptor_rss_field* field=raptor_rss_new_field();
RAPTOR_DEBUG1("fa1 - ");
raptor_rss_item_add_field(update_item, RAPTOR_RSS_FIELD_GUID, field);
if(!strcmp((const char*)attrValue, "true")) {
RAPTOR_DEBUG2(" setting guid to URI '%s'\n", attrValue);
field->uri=raptor_new_uri_relative_to_base(base_uri,
(const unsigned char*)attrValue);
} else {
size_t len=strlen((const char*)attrValue);
RAPTOR_DEBUG2(" setting guid to string '%s'\n", attrValue);
field->value=(unsigned char*)RAPTOR_MALLOC(cstring, len+1);
strncpy((char*)field->value, (char*)attrValue, len+1);
}
}
}
} else if(!strcmp((const char*)attrName, "url")) {
if(!strcmp((const char*)name, "source")) {
/* <source url="...">foo</source> */
if(rss_parser->model.last) {
/*
rss_parser->last->source_url=attrValue;
attrValue=NULL;
*/
}
} else if (!strcmp((const char*)name, "enclosure") && enclosure) {
RAPTOR_DEBUG2(" setting enclosure URL %s\n", attrValue);
enclosure->url=raptor_new_uri_relative_to_base(base_uri,
(const unsigned char*)attrValue);
}
} else if(!strcmp((const char*)attrName, "domain")) {
if(!strcmp((const char*)name, "category")) {
/* <category domain="URL">foo</source> */
if(rss_parser->model.last) {
/*
rss_parser->last->category_url=attrValue;
attrValue=NULL;
*/
}
}
} else if(!strcmp((const char*)attrName, "rel")) {
size_t len=strlen((const char*)attrValue);
RAPTOR_DEBUG2(" setting rel length %s\n", attrValue);
rss_element->rel=(unsigned char*)RAPTOR_MALLOC(cstring, len+1);
strncpy((char*)rss_element->rel, (const char*)attrValue, len+1);
attrValue=NULL;
} else if(!strcmp((const char*)attrName, "href")) {
if(rss_parser->current_field == RAPTOR_RSS_FIELD_LINK ||
rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) {
RAPTOR_DEBUG2(" setting href as URI string for type %s\n", raptor_rss_types_info[rss_parser->current_type].name);
if(rss_element->uri)
raptor_free_uri(rss_element->uri);
rss_element->uri=raptor_new_uri_relative_to_base(base_uri,
(const unsigned char*)attrValue);
}
} else if (!strcmp((const char*)attrName, "length")) {
if (!strcmp((const char*)name, "enclosure") && enclosure) {
size_t len=strlen((const char*)attrValue);
RAPTOR_DEBUG2(" setting enclosure length %s\n", attrValue);
enclosure->length=(char*)RAPTOR_MALLOC(cstring, len+1);
strncpy(enclosure->length, (char*)attrValue, len+1);
}
} else if (!strcmp((const char*)attrName, "type")) {
if (!strcmp((const char*)name, "enclosure") && enclosure) {
size_t len=strlen((const char*)attrValue);
RAPTOR_DEBUG2(" setting enclosure type %s\n", attrValue);
enclosure->type=(char*)RAPTOR_MALLOC(cstring, len+1);
strncpy(enclosure->type, (char*)attrValue, len+1);
} else if(rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) {
/* do nothing with atom link attribute type */
} else if(rss_parser->is_atom) {
/* Atom only typing */
if (!strcmp((const char*)attrValue, "xhtml") ||
!strcmp((const char*)attrValue, "xml") ||
strstr((const char*)attrValue, "+xml")) {
const raptor_uri_handler *uri_handler;
void *uri_context;
RAPTOR_DEBUG2(" found type '%s', making an XML writer\n",
attrValue);
raptor_uri_get_handler(&uri_handler, &uri_context);
rss_element->type=RAPTOR_RSS_CONTENT_TYPE_XML;
rss_element->iostream=raptor_new_iostream_to_string(&rss_element->xml_content, &rss_element->xml_content_length, raptor_alloc_memory);
rss_element->xml_writer=raptor_new_xml_writer(NULL,
uri_handler, uri_context,
rss_element->iostream,
(raptor_simple_message_handler)raptor_parser_simple_error, rdf_parser,
1);
raptor_xml_writer_set_feature(rss_element->xml_writer,
RAPTOR_FEATURE_WRITER_XML_DECLARATION, 0);
raptor_free_stringbuffer(rss_element->sb);
rss_element->sb=NULL;
}
}
} else if (!strcmp((const char*)attrName, "version")) {
if(!raptor_strcasecmp((const char*)name, "feed")) {
if(!strcmp((const char*)attrValue, "0.3"))
rss_parser->is_atom=1;
}
}
}
} /* if have attributes */
}
static void
raptor_rss_end_element_handler(void *user_data,
raptor_xml_element* xml_element)
{
raptor_parser* rdf_parser;
raptor_rss_parser* rss_parser;
#ifdef RAPTOR_DEBUG
const unsigned char* name=raptor_xml_element_get_name(xml_element)->local_name;
#endif
raptor_rss_element* rss_element;
size_t cdata_len=0;
unsigned char* cdata=NULL;
rss_element=(raptor_rss_element*)xml_element->user_data;
rdf_parser=(raptor_parser*)user_data;
rss_parser=(raptor_rss_parser*)rdf_parser->context;
if(rss_element->xml_writer) {
if(rss_element->type != RAPTOR_RSS_CONTENT_TYPE_XML) {
raptor_xml_writer_end_element(rss_element->xml_writer, xml_element);
goto tidy_end_element;
}
/* otherwise we are done making XML */
raptor_free_iostream(rss_element->iostream);
rss_element->iostream=NULL;
cdata=(unsigned char*)rss_element->xml_content;
cdata_len=rss_element->xml_content_length;
}
if(rss_element->sb) {
cdata_len=raptor_stringbuffer_length(rss_element->sb);
cdata=raptor_stringbuffer_as_string(rss_element->sb);
}
if(cdata) {
raptor_uri* base_uri=NULL;
base_uri=raptor_sax2_inscope_base_uri(rss_parser->sax2);
if((rss_parser->current_type==RAPTOR_RSS_NONE ||
rss_parser->current_type==RAPTOR_RSS_UNKNOWN) ||
(rss_parser->current_field==RAPTOR_RSS_FIELD_NONE ||
rss_parser->current_field==RAPTOR_RSS_FIELD_UNKNOWN)) {
unsigned char *p=cdata;
int i;
for(i=cdata_len; i>0 && *p; i--) {
if(!isspace(*p))
break;
p++;
}
if(i>0 && *p) {
RAPTOR_DEBUG4("IGNORING non-whitespace text '%s' inside type %s, field %s\n", cdata,
raptor_rss_types_info[rss_parser->current_type].name,
raptor_rss_fields_info[rss_parser->current_field].name);
}
goto do_end_element;
}
if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) {
/* skipHours, skipDays common but IGNORED */
RAPTOR_DEBUG2("Ignoring fields for type %s\n", raptor_rss_types_info[rss_parser->current_type].name);
} else {
raptor_rss_item* update_item;
raptor_rss_field* field=raptor_rss_new_field();
if(rss_parser->current_type == RAPTOR_RSS_ITEM)
update_item=rss_parser->model.last;
else
update_item=raptor_rss_model_get_common(&rss_parser->model,
rss_parser->current_type);
/* if value is always an uri, make it so */
if(raptor_rss_fields_info[rss_parser->current_field].flags &
RAPTOR_RSS_INFO_FLAG_URI_VALUE) {
RAPTOR_DEBUG4("Added URI %s to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
field->uri=raptor_new_uri_relative_to_base(base_uri, cdata);
} else {
RAPTOR_DEBUG4("Added text '%s' to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
field->uri=NULL;
field->value=(unsigned char*)RAPTOR_MALLOC(cstring, cdata_len+1);
strncpy((char*)field->value, (const char*)cdata, cdata_len);
field->value[cdata_len]='\0';
}
RAPTOR_DEBUG1("fa3 - ");
raptor_rss_item_add_field(update_item, rss_parser->current_field, field);
}
} /* end if contained cdata */
if(raptor_xml_element_is_empty(xml_element)) {
/* Empty element, so consider adding one of the attributes as
* literal or URI content
*/
if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) {
/* skipHours, skipDays common but IGNORED */
RAPTOR_DEBUG3("Ignoring empty element %s for type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name);
} else if(rss_element->uri) {
raptor_rss_item* update_item;
raptor_rss_field* field=raptor_rss_new_field();
if(rss_parser->current_type == RAPTOR_RSS_ITEM)
update_item=rss_parser->model.last;
else
update_item=raptor_rss_model_get_common(&rss_parser->model,
rss_parser->current_type);
if(rss_parser->current_field == RAPTOR_RSS_FIELD_LINK &&
rss_element->rel &&
!strcmp((const char*)rss_element->rel, "alternate")) {
/* RSS with rel != alternate ignored FIXME */
} else if(rss_parser->current_field == RAPTOR_RSS_FIELD_UNKNOWN) {
RAPTOR_DEBUG2("Cannot add URI from alternate attribute to type %s unknown field\n", raptor_rss_types_info[rss_parser->current_type].name);
raptor_rss_field_free(field);
} else {
RAPTOR_DEBUG3("Added URI to field %s of type %s\n", raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
field->uri=rss_element->uri;
rss_element->uri=NULL;
RAPTOR_DEBUG1("fa2 - ");
raptor_rss_item_add_field(update_item, rss_parser->current_field, field);
}
}
}
do_end_element:
if(rss_parser->current_type != RAPTOR_RSS_NONE) {
if(rss_parser->current_field != RAPTOR_RSS_FIELD_NONE) {
RAPTOR_DEBUG3("Ending element %s field %s\n", name, raptor_rss_fields_info[rss_parser->current_field].name);
rss_parser->current_field= RAPTOR_RSS_FIELD_NONE;
} else {
RAPTOR_DEBUG3("Ending element %s type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name);
if(rss_parser->prev_type != RAPTOR_RSS_NONE) {
rss_parser->current_type=rss_parser->prev_type;
rss_parser->prev_type=RAPTOR_RSS_NONE;
RAPTOR_DEBUG3("Returning to type %d - %s\n", rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name);
} else
rss_parser->current_type= RAPTOR_RSS_NONE;
}
}
tidy_end_element:
if(rss_element)
raptor_free_rss_element(rss_element);
}
static void
raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element,
const unsigned char *s, int len)
{
raptor_rss_element* rss_element;
rss_element=(raptor_rss_element*)xml_element->user_data;
if(rss_element->xml_writer) {
raptor_xml_writer_cdata_counted(rss_element->xml_writer, s, len);
return;
}
raptor_stringbuffer_append_counted_string(rss_element->sb, s, len, 1);
}
static void
raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element,
const unsigned char *s)
{
raptor_rss_element* rss_element;
if(!xml_element)
return;
rss_element=(raptor_rss_element*)xml_element->user_data;
if(rss_element->xml_writer) {
raptor_xml_writer_comment(rss_element->xml_writer, s);
return;
}
}
static void
raptor_rss_insert_enclosure_identifiers(raptor_parser* rdf_parser,
raptor_rss_enclosure *enclosure)
{
raptor_identifier* identifier=&enclosure->identifier;
if (enclosure->url) {
/* emit as URI resource */
identifier->uri=raptor_uri_copy(enclosure->url);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
} else {
/* emit as blank node */
identifier->id=raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL);
identifier->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
identifier->uri_source=RAPTOR_URI_SOURCE_GENERATED;
}
enclosure->node_type=raptor_rss_types_info[RAPTOR_RSS_ENCLOSURE].uri;
}
static void
raptor_rss_insert_identifiers(raptor_parser* rdf_parser)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
int i;
raptor_rss_item* item;
for(i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) {
for(item=rss_parser->model.common[i]; item; item=item->next) {
raptor_identifier* identifier;
identifier=&(item->identifier);
if(!item->fields_count)
continue;
RAPTOR_DEBUG3("Inserting identifiers in common type %d - %s\n", i, raptor_rss_types_info[i].name);
if(item->uri) {
identifier->uri=raptor_uri_copy(item->uri);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
} else {
int url_fields[2];
int url_fields_count=1;
int f;
url_fields[0]=(i== RAPTOR_RSS_IMAGE) ? RAPTOR_RSS_FIELD_URL :
RAPTOR_RSS_FIELD_LINK;
if(i == RAPTOR_RSS_CHANNEL) {
url_fields[1]=RAPTOR_RSS_FIELD_ATOM_ID;
url_fields_count++;
}
for(f=0; f < url_fields_count; f++) {
raptor_rss_field* field;
for(field=item->fields[url_fields[f]]; field; field=field->next) {
if(field->value) {
identifier->uri=raptor_new_uri((const unsigned char*)field->value);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
break;
} else if(field->uri) {
identifier->uri=raptor_uri_copy(field->uri);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
break;
}
}
}
if(!identifier->uri) {
/* need to make bnode */
identifier->id=raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL);
identifier->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
identifier->uri_source=RAPTOR_URI_SOURCE_GENERATED;
}
}
item->node_type=&raptor_rss_types_info[i];
}
}
/* sequence of rss:item */
for(item=rss_parser->model.items; item; item=item->next) {
raptor_identifier* identifier=&item->identifier;
raptor_rss_enclosure* enclosure;
if(item->uri) {
identifier->uri=raptor_uri_copy(item->uri);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
} else {
if (item->fields[RAPTOR_RSS_FIELD_LINK]) {
if (item->fields[RAPTOR_RSS_FIELD_LINK]->value) {
identifier->uri=raptor_new_uri((const unsigned char*)item->fields[RAPTOR_RSS_FIELD_LINK]->value);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
} else if(item->fields[RAPTOR_RSS_FIELD_LINK]->uri) {
identifier->uri=raptor_uri_copy(item->fields[RAPTOR_RSS_FIELD_LINK]->uri);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
}
} else if(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]) {
if (item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->value) {
identifier->uri=raptor_new_uri((const unsigned char*)item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->value);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
} else if(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->uri) {
identifier->uri=raptor_uri_copy(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->uri);
identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
identifier->uri_source=RAPTOR_URI_SOURCE_URI;
}
} else {
/* need to make bnode */
identifier->id=raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL);
identifier->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
identifier->uri_source=RAPTOR_URI_SOURCE_GENERATED;
}
}
for(enclosure=item->enclosure; enclosure; enclosure=enclosure->next)
raptor_rss_insert_enclosure_identifiers(rdf_parser, enclosure);
item->node_type=&raptor_rss_types_info[RAPTOR_RSS_ITEM];
}
}
static int
raptor_rss_emit_type_triple(raptor_parser* rdf_parser,
raptor_identifier *resource,
raptor_uri *type_uri)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
if(!resource->uri && !resource->id) {
raptor_parser_error(rdf_parser, "RSS node has no identifier");
return 1;
}
rss_parser->statement.subject=resource->uri ? (void*)resource->uri : (void*)resource->id;
rss_parser->statement.subject_type=resource->type;
rss_parser->statement.predicate=RAPTOR_RSS_RDF_type_URI(&rss_parser->model);
rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
rss_parser->statement.object=(void*)type_uri;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
rss_parser->statement.object_literal_language=NULL;
rss_parser->statement.object_literal_datatype=NULL;
/* Generate the statement */
(*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement);
return 0;
}
static int
raptor_rss_emit_enclosure(raptor_parser* rdf_parser,
raptor_rss_enclosure *enclosure)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
raptor_identifier* identifier=&enclosure->identifier;
const void* subject=rss_parser->statement.subject;
if(!identifier->uri && !identifier->id) {
raptor_parser_error(rdf_parser, "Enclosure has no identifier");
return 1;
}
rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE].uri;
rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
if (identifier->uri) {
/* emit as resource */
rss_parser->statement.object=identifier->uri;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
} else {
/* emit as blank node */
rss_parser->statement.object=identifier->id;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
}
rss_parser->statement.object_literal_language=NULL;
rss_parser->statement.object_literal_datatype=NULL;
(*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement);
if(raptor_rss_emit_type_triple(rdf_parser, identifier, enclosure->node_type))
return 1;
if (enclosure->url) {
rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE_URL].uri;
rss_parser->statement.object=enclosure->url;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
(*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement);
}
if (enclosure->type) {
rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE_TYPE].uri;
rss_parser->statement.object=enclosure->type;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL;
(*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement);
}
if (enclosure->length) {
rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE_LENGTH].uri;
rss_parser->statement.object=enclosure->length;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL;
(*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement);
}
rss_parser->statement.subject=subject;
return 0;
}
static int
raptor_rss_emit_item(raptor_parser* rdf_parser, raptor_rss_item *item)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
int f;
raptor_identifier* identifier=&item->identifier;
raptor_rss_enclosure* enclosure;
if(!item->fields_count)
return 0;
if(raptor_rss_emit_type_triple(rdf_parser, identifier, item->node_type->uri))
return 1;
for(f=0; f< RAPTOR_RSS_FIELDS_SIZE; f++) {
raptor_rss_field* field;
/* This is only made by a connection */
if(f == RAPTOR_RSS_FIELD_ITEMS)
continue;
rss_parser->statement.predicate=raptor_rss_fields_info[f].uri;
if(!rss_parser->statement.predicate)
continue;
rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
for (field=item->fields[f]; field; field=field->next) {
rss_parser->statement.object_literal_language=NULL;
rss_parser->statement.object_literal_datatype=NULL;
if(field->value) {
rss_parser->statement.object=field->value;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL;
/* FIXME - should store and emit languages */
} else {
rss_parser->statement.object=field->uri;
rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
}
/* Generate the statement */
(*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement);
}
}
for(enclosure=item->enclosure; enclosure; enclosure=enclosure->next) {
raptor_rss_emit_enclosure(rdf_parser, enclosure);
}
return 0;
}
static int
raptor_rss_emit_connection(raptor_parser* rdf_parser,
raptor_identifier *subject_identifier,
raptor_uri predicate_uri, int predicate_ordinal,
raptor_identifier *object_identifier)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
if(!subject_identifier->uri && !subject_identifier->id) {
raptor_parser_error(rdf_parser, "Connection subject has no identifier");
return 1;
}
rss_parser->statement.subject=subject_identifier->uri ? (void*)subject_identifier->uri : (void*)subject_identifier->id;
rss_parser->statement.subject_type=subject_identifier->type;
if(predicate_uri) {
rss_parser->statement.predicate=predicate_uri;
rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
} else {
rss_parser->statement.predicate=(void*)&predicate_ordinal;
rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_ORDINAL;
}
rss_parser->statement.object=object_identifier->uri ? (void*)object_identifier->uri : (void*)object_identifier->id;
rss_parser->statement.object_type=object_identifier->type;
rss_parser->statement.object_literal_language=NULL;
rss_parser->statement.object_literal_datatype=NULL;
/* Generate the statement */
(*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement);
return 0;
}
static int
raptor_rss_emit(raptor_parser* rdf_parser)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
int i;
raptor_rss_item* item;
if (!rss_parser->model.common[RAPTOR_RSS_CHANNEL]) {
raptor_parser_error(rdf_parser, "No RSS channel item present");
return 1;
}
if(!rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier.uri &&
!rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier.id) {
raptor_parser_error(rdf_parser, "RSS channel has no identifier");
return 1;
}
for (i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) {
for (item=rss_parser->model.common[i]; item; item=item->next) {
if(!item->fields_count)
continue;
RAPTOR_DEBUG3("Emitting type %i - %s\n", i, raptor_rss_types_info[i].name);
if(!item->identifier.uri && !item->identifier.id) {
raptor_parser_error(rdf_parser, "RSS %s has no identifier", raptor_rss_types_info[i].name);
return 1;
}
if(raptor_rss_emit_item(rdf_parser, item))
return 1;
/* Add connections to channel */
if(i != RAPTOR_RSS_CHANNEL) {
if(raptor_rss_emit_connection(rdf_parser,
&(rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier),
raptor_rss_types_info[i].uri, 0,
&(item->identifier)))
return 1;
}
}
}
if(rss_parser->model.items_count) {
raptor_identifier *items;
/* make a new genid for the <rdf:Seq> node */
items=raptor_new_identifier(RAPTOR_IDENTIFIER_TYPE_ANONYMOUS,
NULL, RAPTOR_URI_SOURCE_GENERATED,
(const unsigned char*)raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL),
NULL, NULL, NULL);
/* _:genid1 rdf:type rdf:Seq . */
if(raptor_rss_emit_type_triple(rdf_parser, items,
RAPTOR_RSS_RDF_Seq_URI(&rss_parser->model))) {
raptor_free_identifier(items);
return 1;
}
/* <channelURI> rss:items _:genid1 . */
if(raptor_rss_emit_connection(rdf_parser,
&(rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier),
raptor_rss_fields_info[RAPTOR_RSS_FIELD_ITEMS].uri, 0,
items)) {
raptor_free_identifier(items);
return 1;
}
/* sequence of rss:item */
for(i=1, item=rss_parser->model.items; item; item=item->next, i++) {
if(raptor_rss_emit_item(rdf_parser, item) ||
raptor_rss_emit_connection(rdf_parser,
items,
NULL, i,
&(item->identifier))) {
raptor_free_identifier(items);
return 1;
}
}
raptor_free_identifier(items);
}
return 0;
}
static const raptor_field_pair raptor_rss_uplift_map[]={
/* from */ /* to */
#ifdef RAPTOR_PARSEDATE_FUNCTION
/* convert to ISO date */
{ RAPTOR_RSS_FIELD_PUBDATE, RAPTOR_RSS_FIELD_DC_DATE },
/* default action: copy fields */
{ RAPTOR_RSS_FIELD_ATOM_UPDATED, RAPTOR_RSS_FIELD_DC_DATE },
#endif
/* default actions: copy fields */
{ RAPTOR_RSS_FIELD_DESCRIPTION, RAPTOR_RSS_FIELD_CONTENT_ENCODED },
{ RAPTOR_RSS_FIELD_UNKNOWN, RAPTOR_RSS_FIELD_UNKNOWN }
};
static void
raptor_rss_uplift_fields(raptor_rss_item* item)
{
int i;
for(i=0; raptor_rss_uplift_map[i].from != RAPTOR_RSS_FIELD_UNKNOWN; i++) {
raptor_rss_fields_type from_field=raptor_rss_uplift_map[i].from;
raptor_rss_fields_type to_field=raptor_rss_uplift_map[i].to;
raptor_rss_field* field=NULL;
size_t len;
if(!(item->fields[from_field] && item->fields[from_field]->value))
continue;
if(from_field == to_field) {
field=item->fields[from_field];
} else {
if(item->fields[to_field] && item->fields[to_field]->value)
continue;
field=raptor_rss_new_field();
raptor_rss_item_add_field(item, to_field, field);
}
#ifdef RAPTOR_PARSEDATE_FUNCTION
/* Get rid of date soup */
if(from_field == RAPTOR_RSS_FIELD_PUBDATE
#if 0
/* or normalize to UTC */
||
from_field == RAPTOR_RSS_FIELD_ATOM_PUBLISHED ||
from_field == RAPTOR_RSS_FIELD_ATOM_UPDATED
#endif
)
raptor_rss_date_uplift(field, item->fields[from_field]->value);
#endif
if(!field->value) {
/* Otherwise default action is to copy from_field value */
len=strlen((const char*)item->fields[from_field]->value);
field->value=(unsigned char*)RAPTOR_MALLOC(cstring, len + 1);
strncpy((char*)field->value, (const char*)item->fields[from_field]->value, len + 1);
}
}
}
static void
raptor_rss_uplift_items(raptor_parser* rdf_parser)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
int i;
raptor_rss_item* item;
for(i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) {
for(item=rss_parser->model.common[i]; item; item=item->next) {
raptor_rss_uplift_fields(item);
}
}
for(item=rss_parser->model.items; item; item=item->next) {
raptor_rss_uplift_fields(item);
}
}
static int
raptor_rss_parse_chunk(raptor_parser* rdf_parser,
const unsigned char *s, size_t len,
int is_end)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
if(rdf_parser->failed)
return 1;
raptor_sax2_parse_chunk(rss_parser->sax2, s, len, is_end);
if(!is_end)
return 0;
if(rdf_parser->failed)
return 1;
/* turn strings into URIs, move things around if needed */
raptor_rss_insert_identifiers(rdf_parser);
/* add some new fields */
raptor_rss_uplift_items(rdf_parser);
/* generate the triples */
raptor_rss_emit(rdf_parser);
return 0;
}
static int
raptor_rss_parse_recognise_syntax(raptor_parser_factory* factory,
const unsigned char *buffer, size_t len,
const unsigned char *identifier,
const unsigned char *suffix,
const char *mime_type)
{
int score= 0;
if(suffix) {
if(!strcmp((const char*)suffix, "rss"))
score=7;
if(!strcmp((const char*)suffix, "atom"))
score=5;
if(!strcmp((const char*)suffix, "xml"))
score=4;
}
if(identifier) {
if(!strncmp((const char*)identifier, "http://feed", 11))
score+=5;
else if(strstr((const char*)identifier, "feed"))
score+=3;
if(strstr((const char*)identifier, "rss2"))
score+=5;
else if(!suffix && strstr((const char*)identifier, "rss"))
score+=4;
else if(!suffix && strstr((const char*)identifier, "atom"))
score+=4;
else if(strstr((const char*)identifier, "rss.xml"))
score+=4;
else if(strstr((const char*)identifier, "atom.xml"))
score+=4;
}
if(mime_type) {
if(!strstr((const char*)mime_type, "html")) {
if(strstr((const char*)mime_type, "rss"))
score+=4;
else if(strstr((const char*)mime_type, "xml"))
score+=4;
else if(strstr((const char*)mime_type, "atom"))
score+=4;
}
}
return score;
}
static int
raptor_rss_parser_register_factory(raptor_parser_factory *factory)
{
int rc=0;
factory->context_length = sizeof(raptor_rss_parser);
factory->need_base_uri = 1;
factory->init = raptor_rss_parse_init;
factory->terminate = raptor_rss_parse_terminate;
factory->start = raptor_rss_parse_start;
factory->chunk = raptor_rss_parse_chunk;
factory->recognise_syntax = raptor_rss_parse_recognise_syntax;
rc+= raptor_parser_factory_add_mime_type(factory, "application/rss", 10) != 0;
rc+= raptor_parser_factory_add_mime_type(factory, "application/rss+xml", 10) != 0;
rc+= raptor_parser_factory_add_mime_type(factory, "text/rss", 8) != 0;
rc+= raptor_parser_factory_add_mime_type(factory, "application/xml", 3) != 0;
rc+= raptor_parser_factory_add_mime_type(factory, "text/xml", 3) != 0;
return rc;
}
int
raptor_init_parser_rss(void)
{
return !raptor_parser_register_factory("rss-tag-soup", "RSS Tag Soup",
&raptor_rss_parser_register_factory);
}