/* -*- Mode: c; c-basic-offset: 2 -*- * * raptor_rss.c - Raptor RSS tag soup parser * * Copyright (C) 2003-2007, David Beckett http://purl.org/net/dajobe/ * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/ * * Contributions: * Copyright (C) 2004-2005, Suzan Foster * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * * */ #ifdef HAVE_CONFIG_H #include #endif #ifdef WIN32 #include #endif #include #include #include #include #ifdef HAVE_ERRNO_H #include #endif /* Raptor includes */ #include "raptor.h" #include "raptor_internal.h" #include "raptor_rss.h" /* local prototypes */ static void raptor_rss_insert_identifiers(raptor_parser* rdf_parser); static void raptor_rss_uplift_items(raptor_parser* rdf_parser); static int raptor_rss_emit(raptor_parser* rdf_parser); static void raptor_rss_start_element_handler(void *user_data, raptor_xml_element* xml_element); static void raptor_rss_end_element_handler(void *user_data, raptor_xml_element* xml_element); static void raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s, int len); static void raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s); /* * RSS parser object */ struct raptor_rss_parser_s { /* static model */ raptor_rss_model model; /* current line */ char *line; /* current line length */ int line_length; /* current char in line buffer */ int offset; /* static statement for use in passing to user code */ raptor_statement statement; raptor_sax2 *sax2; /* rss node type of current item */ raptor_rss_type current_type; /* one place stack */ raptor_rss_type prev_type; raptor_rss_fields_type current_field; /* emptyness of current element */ int element_is_empty; /* stack of namespaces */ raptor_namespace_stack *nstack; /* non-0 if this is an atom 1.0 parser */ int is_atom; /* namespaces declared here */ raptor_namespace* nspaces[RAPTOR_RSS_NAMESPACES_SIZE]; }; typedef struct raptor_rss_parser_s raptor_rss_parser; typedef enum { RAPTOR_RSS_CONTENT_TYPE_NONE, RAPTOR_RSS_CONTENT_TYPE_XML, RAPTOR_RSS_CONTENT_TYPE_TEXT } raptor_rss_content_type; struct raptor_rss_element_s { raptor_uri* uri; const unsigned char *rel; /* Two types of content */ raptor_rss_content_type type; /* 1) XML */ raptor_xml_writer* xml_writer; /* XML written to this iostream to the xml_content string */ raptor_iostream* iostream; /* ends up here */ void *xml_content; size_t xml_content_length; /* 2) cdata */ raptor_stringbuffer* sb; }; typedef struct raptor_rss_element_s raptor_rss_element; static void raptor_free_rss_element(raptor_rss_element *rss_element) { if(rss_element->uri) raptor_free_uri(rss_element->uri); if(rss_element->rel) raptor_free_memory((void*)rss_element->rel); if(rss_element->type == RAPTOR_RSS_CONTENT_TYPE_XML) { if(rss_element->xml_writer) raptor_free_xml_writer(rss_element->xml_writer); if(rss_element->iostream) raptor_free_iostream(rss_element->iostream); if(rss_element->xml_content) raptor_free_memory(rss_element->xml_content); } if(rss_element->sb) raptor_free_stringbuffer(rss_element->sb); RAPTOR_FREE(raptor_rss_element, rss_element); } static int raptor_rss_parse_init(raptor_parser* rdf_parser, const char *name) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; raptor_sax2* sax2; const raptor_uri_handler *uri_handler; void *uri_context; int n; raptor_rss_common_init(); raptor_rss_model_init(&rss_parser->model); rss_parser->prev_type=RAPTOR_RSS_NONE; rss_parser->current_field=RAPTOR_RSS_FIELD_NONE; rss_parser->current_type=RAPTOR_RSS_NONE; if(rss_parser->sax2) { raptor_free_sax2(rss_parser->sax2); rss_parser->sax2=NULL; } raptor_uri_get_handler(&uri_handler, &uri_context); rss_parser->nstack=raptor_new_namespaces(uri_handler, uri_context, NULL, NULL, /* errors */ 1); /* Initialise the namespaces */ for(n=0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) { unsigned const char* prefix=(unsigned const char*)raptor_rss_namespaces_info[n].prefix; raptor_uri* uri=raptor_rss_namespaces_info[n].uri; raptor_namespace* nspace=NULL; if(prefix && uri) nspace=raptor_new_namespace_from_uri(rss_parser->nstack, prefix, uri, 0); rss_parser->nspaces[n]=nspace; } sax2=raptor_new_sax2(rdf_parser, &rdf_parser->error_handlers); rss_parser->sax2=sax2; raptor_sax2_set_start_element_handler(sax2, raptor_rss_start_element_handler); raptor_sax2_set_end_element_handler(sax2, raptor_rss_end_element_handler); raptor_sax2_set_characters_handler(sax2, raptor_rss_cdata_handler); raptor_sax2_set_cdata_handler(sax2, raptor_rss_cdata_handler); raptor_sax2_set_comment_handler(sax2, raptor_rss_comment_handler); return 0; } static void raptor_rss_parse_terminate(raptor_parser *rdf_parser) { raptor_rss_parser *rss_parser=(raptor_rss_parser*)rdf_parser->context; int n; if(rss_parser->sax2) raptor_free_sax2(rss_parser->sax2); raptor_rss_model_clear(&rss_parser->model); /* Initialise the namespaces */ for(n=0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) { if(rss_parser->nspaces[n]) raptor_free_namespace(rss_parser->nspaces[n]); } if(rss_parser->nstack) raptor_free_namespaces(rss_parser->nstack); raptor_rss_common_terminate(); } static int raptor_rss_parse_start(raptor_parser *rdf_parser) { raptor_uri *uri=rdf_parser->base_uri; raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; /* base URI required for RSS */ if(!uri) return 1; /* Optionally forbid network requests in the XML parser */ raptor_sax2_set_feature(rss_parser->sax2, RAPTOR_FEATURE_NO_NET, rdf_parser->features[RAPTOR_FEATURE_NO_NET]); raptor_sax2_parse_start(rss_parser->sax2, uri); return 0; } static void raptor_rss_start_element_handler(void *user_data, raptor_xml_element* xml_element) { raptor_parser *rdf_parser; raptor_rss_parser *rss_parser; raptor_rss_enclosure *enclosure=NULL; raptor_uri* base_uri; raptor_qname *el_qname; const unsigned char *name; int ns_attributes_count; raptor_qname** named_attrs; const raptor_namespace* el_nspace; raptor_rss_element* rss_element; rss_element=(raptor_rss_element*)RAPTOR_CALLOC(raptor_rss_element, sizeof(raptor_rss_element), 1); rss_element->sb=raptor_new_stringbuffer(); xml_element->user_data=rss_element; if(xml_element->parent) { raptor_rss_element* parent_rss_element=(raptor_rss_element*)(xml_element->parent->user_data); if(parent_rss_element->xml_writer) rss_element->xml_writer=parent_rss_element->xml_writer; } if(rss_element->xml_writer) { raptor_xml_writer_start_element(rss_element->xml_writer, xml_element); return; } el_qname=raptor_xml_element_get_name(xml_element); name=el_qname->local_name; el_nspace=el_qname->nspace; rdf_parser=(raptor_parser*)user_data; rss_parser=(raptor_rss_parser*)rdf_parser->context; base_uri=raptor_sax2_inscope_base_uri(rss_parser->sax2); if(rss_parser->current_type == RAPTOR_RSS_NONE) { if(!strcmp((const char*)name, "rss") || !strcmp((const char*)name, "rdf") || !strcmp((const char*)name, "RDF")) { /* rss */ goto check_attributes; } else if(!raptor_strcasecmp((const char*)name, "channel")) { /* rss or atom 0.3 channel */ rss_parser->current_type=RAPTOR_RSS_CHANNEL; } else if(!strcmp((const char*)name, "feed")) { /* atom 1.0 feed */ rss_parser->current_type=RAPTOR_RSS_CHANNEL; rss_parser->is_atom=1; } else if(!strcmp((const char*)name, "item")) { raptor_rss_model_add_item(&rss_parser->model); rss_parser->current_type=RAPTOR_RSS_ITEM; } else if(!strcmp((const char*)name, "entry")) { raptor_rss_model_add_item(&rss_parser->model); rss_parser->current_type=RAPTOR_RSS_ITEM; rss_parser->is_atom=1; } else { int i; rss_parser->current_type=RAPTOR_RSS_UNKNOWN; for(i=0; icurrent_type=(raptor_rss_type)i; break; } } if(rss_parser->current_type == RAPTOR_RSS_UNKNOWN) { RAPTOR_DEBUG2("Unknown start element named %s\n", name); } else { RAPTOR_DEBUG3("FOUND type %d - %s\n", rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name); if (rss_parser->current_type != RAPTOR_RSS_ITEM) raptor_rss_model_add_common(&rss_parser->model, rss_parser->current_type); } } else { /* have current_type, this is an element inside */ int i; raptor_rss_type old_type=rss_parser->current_type; /* check it is not a type here */ if(!strcmp((const char*)name, "item") || !strcmp((const char*)name, "entry")) { raptor_rss_model_add_item(&rss_parser->model); rss_parser->current_type=RAPTOR_RSS_ITEM; } else { for(i=0; iis_atom)) { rss_parser->current_type=(raptor_rss_type)i; break; } } } if(rss_parser->current_type != old_type) { RAPTOR_DEBUG6("FOUND element %s for type %d - %s INSIDE current type %d - %s\n", name, rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name, old_type, raptor_rss_types_info[old_type].name); raptor_rss_model_add_common(&rss_parser->model, rss_parser->current_type); rss_parser->prev_type=old_type; goto check_attributes; } rss_parser->current_field=RAPTOR_RSS_FIELD_UNKNOWN; for(i=0; i RSS 1.0 namespace */ if(nspace_URI && (raptor_uri_equals(nspace_URI, raptor_rss_namespaces_info[RSS0_9_NS].uri) || raptor_uri_equals(nspace_URI, raptor_rss_namespaces_info[RSS1_1_NS].uri))) { nspace_URI=raptor_rss_namespaces_info[RSS1_0_NS].uri; } /* Atom 0.3 namespace => Atom 1.0 namespace */ if(nspace_URI && raptor_uri_equals(nspace_URI, raptor_rss_namespaces_info[ATOM0_3_NS].uri)) { nspace_URI=raptor_rss_namespaces_info[ATOM1_0_NS].uri; } if(nspace_URI && raptor_rss_fields_info[i].nspace != RSS_NO_NS) { raptor_uri* field_nspace_URI=raptor_rss_namespaces_info[raptor_rss_fields_info[i].nspace].uri; if(raptor_uri_equals(nspace_URI, field_nspace_URI)) { rss_parser->current_field=(raptor_rss_fields_type)i; break; } } else { rss_parser->current_field=(raptor_rss_fields_type)i; break; } } if(rss_parser->current_field==RAPTOR_RSS_FIELD_UNKNOWN) { RAPTOR_DEBUG3("Unknown field element named %s inside type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name); } else if (rss_parser->current_field == RAPTOR_RSS_FIELD_ENCLOSURE ){ raptor_rss_item* update_item; RAPTOR_DEBUG1("FOUND new enclosure\n"); if(rss_parser->current_type == RAPTOR_RSS_ITEM) { update_item=rss_parser->model.last; enclosure=raptor_rss_new_enclosure(); raptor_rss_item_add_enclosure(update_item, enclosure); } } else { RAPTOR_DEBUG4("FOUND field %d - %s inside type %s\n", rss_parser->current_field, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name); /* Rewrite item fields */ for(i=0; raptor_atom_to_rss[i].from != RAPTOR_RSS_FIELD_UNKNOWN; i++) { if(raptor_atom_to_rss[i].from == rss_parser->current_field) { rss_parser->current_field=raptor_atom_to_rss[i].to; RAPTOR_DEBUG3("Rewrote into field %d - %s\n", rss_parser->current_field, raptor_rss_fields_info[rss_parser->current_field].name); break; } } } } check_attributes: named_attrs=raptor_xml_element_get_attributes(xml_element); ns_attributes_count=raptor_xml_element_get_attributes_count(xml_element); /* Now check for attributes */ if(named_attrs && ns_attributes_count) { int i; for (i = 0; i < ns_attributes_count; i++) { raptor_qname* attr=named_attrs[i]; const unsigned char* attrName = attr->local_name; const unsigned char* attrValue = attr->value; RAPTOR_DEBUG3(" attribute %s=%s\n", attrName, attrValue); /* Pick a few attributes to care about */ if(!strcmp((const char*)attrName, "isPermaLink")) { raptor_rss_item* update_item=rss_parser->model.last; if(!strcmp((const char*)name, "guid")) { /* */ if(update_item) { raptor_rss_field* field=raptor_rss_new_field(); RAPTOR_DEBUG1("fa1 - "); raptor_rss_item_add_field(update_item, RAPTOR_RSS_FIELD_GUID, field); if(!strcmp((const char*)attrValue, "true")) { RAPTOR_DEBUG2(" setting guid to URI '%s'\n", attrValue); field->uri=raptor_new_uri_relative_to_base(base_uri, (const unsigned char*)attrValue); } else { size_t len=strlen((const char*)attrValue); RAPTOR_DEBUG2(" setting guid to string '%s'\n", attrValue); field->value=(unsigned char*)RAPTOR_MALLOC(cstring, len+1); strncpy((char*)field->value, (char*)attrValue, len+1); } } } } else if(!strcmp((const char*)attrName, "url")) { if(!strcmp((const char*)name, "source")) { /* foo */ if(rss_parser->model.last) { /* rss_parser->last->source_url=attrValue; attrValue=NULL; */ } } else if (!strcmp((const char*)name, "enclosure") && enclosure) { RAPTOR_DEBUG2(" setting enclosure URL %s\n", attrValue); enclosure->url=raptor_new_uri_relative_to_base(base_uri, (const unsigned char*)attrValue); } } else if(!strcmp((const char*)attrName, "domain")) { if(!strcmp((const char*)name, "category")) { /* foo */ if(rss_parser->model.last) { /* rss_parser->last->category_url=attrValue; attrValue=NULL; */ } } } else if(!strcmp((const char*)attrName, "rel")) { size_t len=strlen((const char*)attrValue); RAPTOR_DEBUG2(" setting rel length %s\n", attrValue); rss_element->rel=(unsigned char*)RAPTOR_MALLOC(cstring, len+1); strncpy((char*)rss_element->rel, (const char*)attrValue, len+1); attrValue=NULL; } else if(!strcmp((const char*)attrName, "href")) { if(rss_parser->current_field == RAPTOR_RSS_FIELD_LINK || rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) { RAPTOR_DEBUG2(" setting href as URI string for type %s\n", raptor_rss_types_info[rss_parser->current_type].name); if(rss_element->uri) raptor_free_uri(rss_element->uri); rss_element->uri=raptor_new_uri_relative_to_base(base_uri, (const unsigned char*)attrValue); } } else if (!strcmp((const char*)attrName, "length")) { if (!strcmp((const char*)name, "enclosure") && enclosure) { size_t len=strlen((const char*)attrValue); RAPTOR_DEBUG2(" setting enclosure length %s\n", attrValue); enclosure->length=(char*)RAPTOR_MALLOC(cstring, len+1); strncpy(enclosure->length, (char*)attrValue, len+1); } } else if (!strcmp((const char*)attrName, "type")) { if (!strcmp((const char*)name, "enclosure") && enclosure) { size_t len=strlen((const char*)attrValue); RAPTOR_DEBUG2(" setting enclosure type %s\n", attrValue); enclosure->type=(char*)RAPTOR_MALLOC(cstring, len+1); strncpy(enclosure->type, (char*)attrValue, len+1); } else if(rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) { /* do nothing with atom link attribute type */ } else if(rss_parser->is_atom) { /* Atom only typing */ if (!strcmp((const char*)attrValue, "xhtml") || !strcmp((const char*)attrValue, "xml") || strstr((const char*)attrValue, "+xml")) { const raptor_uri_handler *uri_handler; void *uri_context; RAPTOR_DEBUG2(" found type '%s', making an XML writer\n", attrValue); raptor_uri_get_handler(&uri_handler, &uri_context); rss_element->type=RAPTOR_RSS_CONTENT_TYPE_XML; rss_element->iostream=raptor_new_iostream_to_string(&rss_element->xml_content, &rss_element->xml_content_length, raptor_alloc_memory); rss_element->xml_writer=raptor_new_xml_writer(NULL, uri_handler, uri_context, rss_element->iostream, (raptor_simple_message_handler)raptor_parser_simple_error, rdf_parser, 1); raptor_xml_writer_set_feature(rss_element->xml_writer, RAPTOR_FEATURE_WRITER_XML_DECLARATION, 0); raptor_free_stringbuffer(rss_element->sb); rss_element->sb=NULL; } } } else if (!strcmp((const char*)attrName, "version")) { if(!raptor_strcasecmp((const char*)name, "feed")) { if(!strcmp((const char*)attrValue, "0.3")) rss_parser->is_atom=1; } } } } /* if have attributes */ } static void raptor_rss_end_element_handler(void *user_data, raptor_xml_element* xml_element) { raptor_parser* rdf_parser; raptor_rss_parser* rss_parser; #ifdef RAPTOR_DEBUG const unsigned char* name=raptor_xml_element_get_name(xml_element)->local_name; #endif raptor_rss_element* rss_element; size_t cdata_len=0; unsigned char* cdata=NULL; rss_element=(raptor_rss_element*)xml_element->user_data; rdf_parser=(raptor_parser*)user_data; rss_parser=(raptor_rss_parser*)rdf_parser->context; if(rss_element->xml_writer) { if(rss_element->type != RAPTOR_RSS_CONTENT_TYPE_XML) { raptor_xml_writer_end_element(rss_element->xml_writer, xml_element); goto tidy_end_element; } /* otherwise we are done making XML */ raptor_free_iostream(rss_element->iostream); rss_element->iostream=NULL; cdata=(unsigned char*)rss_element->xml_content; cdata_len=rss_element->xml_content_length; } if(rss_element->sb) { cdata_len=raptor_stringbuffer_length(rss_element->sb); cdata=raptor_stringbuffer_as_string(rss_element->sb); } if(cdata) { raptor_uri* base_uri=NULL; base_uri=raptor_sax2_inscope_base_uri(rss_parser->sax2); if((rss_parser->current_type==RAPTOR_RSS_NONE || rss_parser->current_type==RAPTOR_RSS_UNKNOWN) || (rss_parser->current_field==RAPTOR_RSS_FIELD_NONE || rss_parser->current_field==RAPTOR_RSS_FIELD_UNKNOWN)) { unsigned char *p=cdata; int i; for(i=cdata_len; i>0 && *p; i--) { if(!isspace(*p)) break; p++; } if(i>0 && *p) { RAPTOR_DEBUG4("IGNORING non-whitespace text '%s' inside type %s, field %s\n", cdata, raptor_rss_types_info[rss_parser->current_type].name, raptor_rss_fields_info[rss_parser->current_field].name); } goto do_end_element; } if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) { /* skipHours, skipDays common but IGNORED */ RAPTOR_DEBUG2("Ignoring fields for type %s\n", raptor_rss_types_info[rss_parser->current_type].name); } else { raptor_rss_item* update_item; raptor_rss_field* field=raptor_rss_new_field(); if(rss_parser->current_type == RAPTOR_RSS_ITEM) update_item=rss_parser->model.last; else update_item=raptor_rss_model_get_common(&rss_parser->model, rss_parser->current_type); /* if value is always an uri, make it so */ if(raptor_rss_fields_info[rss_parser->current_field].flags & RAPTOR_RSS_INFO_FLAG_URI_VALUE) { RAPTOR_DEBUG4("Added URI %s to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name); field->uri=raptor_new_uri_relative_to_base(base_uri, cdata); } else { RAPTOR_DEBUG4("Added text '%s' to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name); field->uri=NULL; field->value=(unsigned char*)RAPTOR_MALLOC(cstring, cdata_len+1); strncpy((char*)field->value, (const char*)cdata, cdata_len); field->value[cdata_len]='\0'; } RAPTOR_DEBUG1("fa3 - "); raptor_rss_item_add_field(update_item, rss_parser->current_field, field); } } /* end if contained cdata */ if(raptor_xml_element_is_empty(xml_element)) { /* Empty element, so consider adding one of the attributes as * literal or URI content */ if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) { /* skipHours, skipDays common but IGNORED */ RAPTOR_DEBUG3("Ignoring empty element %s for type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name); } else if(rss_element->uri) { raptor_rss_item* update_item; raptor_rss_field* field=raptor_rss_new_field(); if(rss_parser->current_type == RAPTOR_RSS_ITEM) update_item=rss_parser->model.last; else update_item=raptor_rss_model_get_common(&rss_parser->model, rss_parser->current_type); if(rss_parser->current_field == RAPTOR_RSS_FIELD_LINK && rss_element->rel && !strcmp((const char*)rss_element->rel, "alternate")) { /* RSS with rel != alternate ignored FIXME */ } else if(rss_parser->current_field == RAPTOR_RSS_FIELD_UNKNOWN) { RAPTOR_DEBUG2("Cannot add URI from alternate attribute to type %s unknown field\n", raptor_rss_types_info[rss_parser->current_type].name); raptor_rss_field_free(field); } else { RAPTOR_DEBUG3("Added URI to field %s of type %s\n", raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name); field->uri=rss_element->uri; rss_element->uri=NULL; RAPTOR_DEBUG1("fa2 - "); raptor_rss_item_add_field(update_item, rss_parser->current_field, field); } } } do_end_element: if(rss_parser->current_type != RAPTOR_RSS_NONE) { if(rss_parser->current_field != RAPTOR_RSS_FIELD_NONE) { RAPTOR_DEBUG3("Ending element %s field %s\n", name, raptor_rss_fields_info[rss_parser->current_field].name); rss_parser->current_field= RAPTOR_RSS_FIELD_NONE; } else { RAPTOR_DEBUG3("Ending element %s type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name); if(rss_parser->prev_type != RAPTOR_RSS_NONE) { rss_parser->current_type=rss_parser->prev_type; rss_parser->prev_type=RAPTOR_RSS_NONE; RAPTOR_DEBUG3("Returning to type %d - %s\n", rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name); } else rss_parser->current_type= RAPTOR_RSS_NONE; } } tidy_end_element: if(rss_element) raptor_free_rss_element(rss_element); } static void raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s, int len) { raptor_rss_element* rss_element; rss_element=(raptor_rss_element*)xml_element->user_data; if(rss_element->xml_writer) { raptor_xml_writer_cdata_counted(rss_element->xml_writer, s, len); return; } raptor_stringbuffer_append_counted_string(rss_element->sb, s, len, 1); } static void raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s) { raptor_rss_element* rss_element; if(!xml_element) return; rss_element=(raptor_rss_element*)xml_element->user_data; if(rss_element->xml_writer) { raptor_xml_writer_comment(rss_element->xml_writer, s); return; } } static void raptor_rss_insert_enclosure_identifiers(raptor_parser* rdf_parser, raptor_rss_enclosure *enclosure) { raptor_identifier* identifier=&enclosure->identifier; if (enclosure->url) { /* emit as URI resource */ identifier->uri=raptor_uri_copy(enclosure->url); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; } else { /* emit as blank node */ identifier->id=raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL); identifier->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS; identifier->uri_source=RAPTOR_URI_SOURCE_GENERATED; } enclosure->node_type=raptor_rss_types_info[RAPTOR_RSS_ENCLOSURE].uri; } static void raptor_rss_insert_identifiers(raptor_parser* rdf_parser) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; int i; raptor_rss_item* item; for(i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) { for(item=rss_parser->model.common[i]; item; item=item->next) { raptor_identifier* identifier; identifier=&(item->identifier); if(!item->fields_count) continue; RAPTOR_DEBUG3("Inserting identifiers in common type %d - %s\n", i, raptor_rss_types_info[i].name); if(item->uri) { identifier->uri=raptor_uri_copy(item->uri); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; } else { int url_fields[2]; int url_fields_count=1; int f; url_fields[0]=(i== RAPTOR_RSS_IMAGE) ? RAPTOR_RSS_FIELD_URL : RAPTOR_RSS_FIELD_LINK; if(i == RAPTOR_RSS_CHANNEL) { url_fields[1]=RAPTOR_RSS_FIELD_ATOM_ID; url_fields_count++; } for(f=0; f < url_fields_count; f++) { raptor_rss_field* field; for(field=item->fields[url_fields[f]]; field; field=field->next) { if(field->value) { identifier->uri=raptor_new_uri((const unsigned char*)field->value); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; break; } else if(field->uri) { identifier->uri=raptor_uri_copy(field->uri); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; break; } } } if(!identifier->uri) { /* need to make bnode */ identifier->id=raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL); identifier->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS; identifier->uri_source=RAPTOR_URI_SOURCE_GENERATED; } } item->node_type=&raptor_rss_types_info[i]; } } /* sequence of rss:item */ for(item=rss_parser->model.items; item; item=item->next) { raptor_identifier* identifier=&item->identifier; raptor_rss_enclosure* enclosure; if(item->uri) { identifier->uri=raptor_uri_copy(item->uri); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; } else { if (item->fields[RAPTOR_RSS_FIELD_LINK]) { if (item->fields[RAPTOR_RSS_FIELD_LINK]->value) { identifier->uri=raptor_new_uri((const unsigned char*)item->fields[RAPTOR_RSS_FIELD_LINK]->value); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; } else if(item->fields[RAPTOR_RSS_FIELD_LINK]->uri) { identifier->uri=raptor_uri_copy(item->fields[RAPTOR_RSS_FIELD_LINK]->uri); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; } } else if(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]) { if (item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->value) { identifier->uri=raptor_new_uri((const unsigned char*)item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->value); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; } else if(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->uri) { identifier->uri=raptor_uri_copy(item->fields[RAPTOR_RSS_FIELD_ATOM_ID]->uri); identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; identifier->uri_source=RAPTOR_URI_SOURCE_URI; } } else { /* need to make bnode */ identifier->id=raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL); identifier->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS; identifier->uri_source=RAPTOR_URI_SOURCE_GENERATED; } } for(enclosure=item->enclosure; enclosure; enclosure=enclosure->next) raptor_rss_insert_enclosure_identifiers(rdf_parser, enclosure); item->node_type=&raptor_rss_types_info[RAPTOR_RSS_ITEM]; } } static int raptor_rss_emit_type_triple(raptor_parser* rdf_parser, raptor_identifier *resource, raptor_uri *type_uri) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; if(!resource->uri && !resource->id) { raptor_parser_error(rdf_parser, "RSS node has no identifier"); return 1; } rss_parser->statement.subject=resource->uri ? (void*)resource->uri : (void*)resource->id; rss_parser->statement.subject_type=resource->type; rss_parser->statement.predicate=RAPTOR_RSS_RDF_type_URI(&rss_parser->model); rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; rss_parser->statement.object=(void*)type_uri; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; rss_parser->statement.object_literal_language=NULL; rss_parser->statement.object_literal_datatype=NULL; /* Generate the statement */ (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); return 0; } static int raptor_rss_emit_enclosure(raptor_parser* rdf_parser, raptor_rss_enclosure *enclosure) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; raptor_identifier* identifier=&enclosure->identifier; const void* subject=rss_parser->statement.subject; if(!identifier->uri && !identifier->id) { raptor_parser_error(rdf_parser, "Enclosure has no identifier"); return 1; } rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE].uri; rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; if (identifier->uri) { /* emit as resource */ rss_parser->statement.object=identifier->uri; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; } else { /* emit as blank node */ rss_parser->statement.object=identifier->id; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS; } rss_parser->statement.object_literal_language=NULL; rss_parser->statement.object_literal_datatype=NULL; (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); if(raptor_rss_emit_type_triple(rdf_parser, identifier, enclosure->node_type)) return 1; if (enclosure->url) { rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE_URL].uri; rss_parser->statement.object=enclosure->url; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); } if (enclosure->type) { rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE_TYPE].uri; rss_parser->statement.object=enclosure->type; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL; (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); } if (enclosure->length) { rss_parser->statement.predicate=raptor_rss_fields_info[RAPTOR_RSS_RDF_ENCLOSURE_LENGTH].uri; rss_parser->statement.object=enclosure->length; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL; (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); } rss_parser->statement.subject=subject; return 0; } static int raptor_rss_emit_item(raptor_parser* rdf_parser, raptor_rss_item *item) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; int f; raptor_identifier* identifier=&item->identifier; raptor_rss_enclosure* enclosure; if(!item->fields_count) return 0; if(raptor_rss_emit_type_triple(rdf_parser, identifier, item->node_type->uri)) return 1; for(f=0; f< RAPTOR_RSS_FIELDS_SIZE; f++) { raptor_rss_field* field; /* This is only made by a connection */ if(f == RAPTOR_RSS_FIELD_ITEMS) continue; rss_parser->statement.predicate=raptor_rss_fields_info[f].uri; if(!rss_parser->statement.predicate) continue; rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; for (field=item->fields[f]; field; field=field->next) { rss_parser->statement.object_literal_language=NULL; rss_parser->statement.object_literal_datatype=NULL; if(field->value) { rss_parser->statement.object=field->value; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL; /* FIXME - should store and emit languages */ } else { rss_parser->statement.object=field->uri; rss_parser->statement.object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; } /* Generate the statement */ (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); } } for(enclosure=item->enclosure; enclosure; enclosure=enclosure->next) { raptor_rss_emit_enclosure(rdf_parser, enclosure); } return 0; } static int raptor_rss_emit_connection(raptor_parser* rdf_parser, raptor_identifier *subject_identifier, raptor_uri predicate_uri, int predicate_ordinal, raptor_identifier *object_identifier) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; if(!subject_identifier->uri && !subject_identifier->id) { raptor_parser_error(rdf_parser, "Connection subject has no identifier"); return 1; } rss_parser->statement.subject=subject_identifier->uri ? (void*)subject_identifier->uri : (void*)subject_identifier->id; rss_parser->statement.subject_type=subject_identifier->type; if(predicate_uri) { rss_parser->statement.predicate=predicate_uri; rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE; } else { rss_parser->statement.predicate=(void*)&predicate_ordinal; rss_parser->statement.predicate_type=RAPTOR_IDENTIFIER_TYPE_ORDINAL; } rss_parser->statement.object=object_identifier->uri ? (void*)object_identifier->uri : (void*)object_identifier->id; rss_parser->statement.object_type=object_identifier->type; rss_parser->statement.object_literal_language=NULL; rss_parser->statement.object_literal_datatype=NULL; /* Generate the statement */ (*rdf_parser->statement_handler)(rdf_parser->user_data, &rss_parser->statement); return 0; } static int raptor_rss_emit(raptor_parser* rdf_parser) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; int i; raptor_rss_item* item; if (!rss_parser->model.common[RAPTOR_RSS_CHANNEL]) { raptor_parser_error(rdf_parser, "No RSS channel item present"); return 1; } if(!rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier.uri && !rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier.id) { raptor_parser_error(rdf_parser, "RSS channel has no identifier"); return 1; } for (i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) { for (item=rss_parser->model.common[i]; item; item=item->next) { if(!item->fields_count) continue; RAPTOR_DEBUG3("Emitting type %i - %s\n", i, raptor_rss_types_info[i].name); if(!item->identifier.uri && !item->identifier.id) { raptor_parser_error(rdf_parser, "RSS %s has no identifier", raptor_rss_types_info[i].name); return 1; } if(raptor_rss_emit_item(rdf_parser, item)) return 1; /* Add connections to channel */ if(i != RAPTOR_RSS_CHANNEL) { if(raptor_rss_emit_connection(rdf_parser, &(rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier), raptor_rss_types_info[i].uri, 0, &(item->identifier))) return 1; } } } if(rss_parser->model.items_count) { raptor_identifier *items; /* make a new genid for the node */ items=raptor_new_identifier(RAPTOR_IDENTIFIER_TYPE_ANONYMOUS, NULL, RAPTOR_URI_SOURCE_GENERATED, (const unsigned char*)raptor_parser_internal_generate_id(rdf_parser, RAPTOR_GENID_TYPE_BNODEID, NULL), NULL, NULL, NULL); /* _:genid1 rdf:type rdf:Seq . */ if(raptor_rss_emit_type_triple(rdf_parser, items, RAPTOR_RSS_RDF_Seq_URI(&rss_parser->model))) { raptor_free_identifier(items); return 1; } /* rss:items _:genid1 . */ if(raptor_rss_emit_connection(rdf_parser, &(rss_parser->model.common[RAPTOR_RSS_CHANNEL]->identifier), raptor_rss_fields_info[RAPTOR_RSS_FIELD_ITEMS].uri, 0, items)) { raptor_free_identifier(items); return 1; } /* sequence of rss:item */ for(i=1, item=rss_parser->model.items; item; item=item->next, i++) { if(raptor_rss_emit_item(rdf_parser, item) || raptor_rss_emit_connection(rdf_parser, items, NULL, i, &(item->identifier))) { raptor_free_identifier(items); return 1; } } raptor_free_identifier(items); } return 0; } static const raptor_field_pair raptor_rss_uplift_map[]={ /* from */ /* to */ #ifdef RAPTOR_PARSEDATE_FUNCTION /* convert to ISO date */ { RAPTOR_RSS_FIELD_PUBDATE, RAPTOR_RSS_FIELD_DC_DATE }, /* default action: copy fields */ { RAPTOR_RSS_FIELD_ATOM_UPDATED, RAPTOR_RSS_FIELD_DC_DATE }, #endif /* default actions: copy fields */ { RAPTOR_RSS_FIELD_DESCRIPTION, RAPTOR_RSS_FIELD_CONTENT_ENCODED }, { RAPTOR_RSS_FIELD_UNKNOWN, RAPTOR_RSS_FIELD_UNKNOWN } }; static void raptor_rss_uplift_fields(raptor_rss_item* item) { int i; for(i=0; raptor_rss_uplift_map[i].from != RAPTOR_RSS_FIELD_UNKNOWN; i++) { raptor_rss_fields_type from_field=raptor_rss_uplift_map[i].from; raptor_rss_fields_type to_field=raptor_rss_uplift_map[i].to; raptor_rss_field* field=NULL; size_t len; if(!(item->fields[from_field] && item->fields[from_field]->value)) continue; if(from_field == to_field) { field=item->fields[from_field]; } else { if(item->fields[to_field] && item->fields[to_field]->value) continue; field=raptor_rss_new_field(); raptor_rss_item_add_field(item, to_field, field); } #ifdef RAPTOR_PARSEDATE_FUNCTION /* Get rid of date soup */ if(from_field == RAPTOR_RSS_FIELD_PUBDATE #if 0 /* or normalize to UTC */ || from_field == RAPTOR_RSS_FIELD_ATOM_PUBLISHED || from_field == RAPTOR_RSS_FIELD_ATOM_UPDATED #endif ) raptor_rss_date_uplift(field, item->fields[from_field]->value); #endif if(!field->value) { /* Otherwise default action is to copy from_field value */ len=strlen((const char*)item->fields[from_field]->value); field->value=(unsigned char*)RAPTOR_MALLOC(cstring, len + 1); strncpy((char*)field->value, (const char*)item->fields[from_field]->value, len + 1); } } } static void raptor_rss_uplift_items(raptor_parser* rdf_parser) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; int i; raptor_rss_item* item; for(i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) { for(item=rss_parser->model.common[i]; item; item=item->next) { raptor_rss_uplift_fields(item); } } for(item=rss_parser->model.items; item; item=item->next) { raptor_rss_uplift_fields(item); } } static int raptor_rss_parse_chunk(raptor_parser* rdf_parser, const unsigned char *s, size_t len, int is_end) { raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context; if(rdf_parser->failed) return 1; raptor_sax2_parse_chunk(rss_parser->sax2, s, len, is_end); if(!is_end) return 0; if(rdf_parser->failed) return 1; /* turn strings into URIs, move things around if needed */ raptor_rss_insert_identifiers(rdf_parser); /* add some new fields */ raptor_rss_uplift_items(rdf_parser); /* generate the triples */ raptor_rss_emit(rdf_parser); return 0; } static int raptor_rss_parse_recognise_syntax(raptor_parser_factory* factory, const unsigned char *buffer, size_t len, const unsigned char *identifier, const unsigned char *suffix, const char *mime_type) { int score= 0; if(suffix) { if(!strcmp((const char*)suffix, "rss")) score=7; if(!strcmp((const char*)suffix, "atom")) score=5; if(!strcmp((const char*)suffix, "xml")) score=4; } if(identifier) { if(!strncmp((const char*)identifier, "http://feed", 11)) score+=5; else if(strstr((const char*)identifier, "feed")) score+=3; if(strstr((const char*)identifier, "rss2")) score+=5; else if(!suffix && strstr((const char*)identifier, "rss")) score+=4; else if(!suffix && strstr((const char*)identifier, "atom")) score+=4; else if(strstr((const char*)identifier, "rss.xml")) score+=4; else if(strstr((const char*)identifier, "atom.xml")) score+=4; } if(mime_type) { if(!strstr((const char*)mime_type, "html")) { if(strstr((const char*)mime_type, "rss")) score+=4; else if(strstr((const char*)mime_type, "xml")) score+=4; else if(strstr((const char*)mime_type, "atom")) score+=4; } } return score; } static int raptor_rss_parser_register_factory(raptor_parser_factory *factory) { int rc=0; factory->context_length = sizeof(raptor_rss_parser); factory->need_base_uri = 1; factory->init = raptor_rss_parse_init; factory->terminate = raptor_rss_parse_terminate; factory->start = raptor_rss_parse_start; factory->chunk = raptor_rss_parse_chunk; factory->recognise_syntax = raptor_rss_parse_recognise_syntax; rc+= raptor_parser_factory_add_mime_type(factory, "application/rss", 10) != 0; rc+= raptor_parser_factory_add_mime_type(factory, "application/rss+xml", 10) != 0; rc+= raptor_parser_factory_add_mime_type(factory, "text/rss", 8) != 0; rc+= raptor_parser_factory_add_mime_type(factory, "application/xml", 3) != 0; rc+= raptor_parser_factory_add_mime_type(factory, "text/xml", 3) != 0; return rc; } int raptor_init_parser_rss(void) { return !raptor_parser_register_factory("rss-tag-soup", "RSS Tag Soup", &raptor_rss_parser_register_factory); }