Core Code/QHTMLLinkFinder.m

/*
    File:       QHTMLLinkFinder.m
 
    Contains:   Finds links in HTML.
 
    Written by: DTS
 
    Copyright:  Copyright (c) 2011-2013 Apple Inc. All Rights Reserved.
 
    Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc.
                ("Apple") in consideration of your agreement to the following
                terms, and your use, installation, modification or
                redistribution of this Apple software constitutes acceptance of
                these terms.  If you do not agree with these terms, please do
                not use, install, modify or redistribute this Apple software.
 
                In consideration of your agreement to abide by the following
                terms, and subject to these terms, Apple grants you a personal,
                non-exclusive license, under Apple's copyrights in this
                original Apple software (the "Apple Software"), to use,
                reproduce, modify and redistribute the Apple Software, with or
                without modifications, in source and/or binary forms; provided
                that if you redistribute the Apple Software in its entirety and
                without modifications, you must retain this notice and the
                following text and disclaimers in all such redistributions of
                the Apple Software. Neither the name, trademarks, service marks
                or logos of Apple Inc. may be used to endorse or promote
                products derived from the Apple Software without specific prior
                written permission from Apple.  Except as expressly stated in
                this notice, no other rights or licenses, express or implied,
                are granted by Apple herein, including but not limited to any
                patent rights that may be infringed by your derivative works or
                by other works in which the Apple Software may be incorporated.
 
                The Apple Software is provided by Apple on an "AS IS" basis. 
                APPLE MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
                WITHOUT LIMITATION THE IMPLIED WARRANTIES OF NON-INFRINGEMENT,
                MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING
                THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
                COMBINATION WITH YOUR PRODUCTS.
 
                IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT,
                INCIDENTAL OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
                TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
                DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ARISING IN ANY WAY
                OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
                OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY
                OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR
                OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF
                SUCH DAMAGE.
 
*/
 
#import "QHTMLLinkFinder.h"
 
#include <libxml/HTMLparser.h>
 
// If we're building with the 10.5 SDK, define our own version of this symbol.
 
#if LIBXML_VERSION < 20703
enum {
    HTML_PARSE_RECOVER  = 1<<0, /* Relaxed parsing */
};
#endif
 
@interface QHTMLLinkFinder ()
 
// Read/write versions of public properties
 
@property (atomic, copy,   readwrite) NSError *         error;
 
// Internal properties
 
@property (atomic, strong, readonly ) NSMutableArray *  mutableLinkURLs;
@property (atomic, strong, readonly ) NSMutableArray *  mutableImageURLs;
 
@end
 
@implementation QHTMLLinkFinder
 
@synthesize data  = _data;
@synthesize URL   = _URL;
@synthesize useRelaxedParsing = _useRelaxedParsing;
 
@synthesize error = _error;
@synthesize mutableLinkURLs  = _mutableLinkURLs;
@synthesize mutableImageURLs = _mutableImageURLs;
 
- (id)initWithData:(NSData *)data fromURL:(NSURL *)url
{
    assert(data != nil);
    assert(url != nil);
    self = [super init];
    if (self != nil) {
        self->_data = [data copy];
        assert(self->_data != nil);
        self->_URL = [url copy];
        assert(self->_URL != nil);
        self->_mutableLinkURLs = [[NSMutableArray alloc] init];
        assert(self->_mutableLinkURLs != nil);
        self->_mutableImageURLs = [[NSMutableArray alloc] init];
        assert(self->_mutableImageURLs != nil);
    }
    return self;
}
 
- (void)dealloc
{
    [self->_mutableLinkURLs release];
    [self->_mutableImageURLs release];
    [self->_error release];
    [self->_URL release];
    [self->_data release];
    [super dealloc];
}
 
- (NSArray *)linkURLs
    // This getter returns a snapshot of the current parser state so that, 
    // if you call it before the parse is done, you don't get a mutable array 
    // that's still being mutated.
{
    return [[self->_mutableLinkURLs copy] autorelease];
}
 
- (NSArray *)imageURLs
    // This getter returns a snapshot of the current parser state so that, 
    // if you call it before the parse is done, you don't get a mutable array 
    // that's still being mutated.
{
    return [[self->_mutableImageURLs copy] autorelease];
}
 
- (void)addURLForCString:(const char *)cStr toArray:(NSMutableArray *)array
    // Adds a URL to the specified array, handling lots of wacky edge cases.
{
    NSString *  str;
    NSURL *     url;
    
    // cStr should be ASCII but, just to be permissive, we'll accept UTF-8. 
    // Handle the case where cStr is not valid UTF-8.
    
    str = @(cStr);
    if (str == nil) {
        assert(NO);
    } else {
    
        // Construct a relativel URL based on our base URL and the string. 
        // This can and does fail on real world systems (curse those users 
        // and their bogus HTML!).
    
        url = [NSURL URLWithString:str relativeToURL:self.URL];
        if (url == nil) {
            NSLog(@"Could not construct URL from '%@' relative to '%@'.", str, self.URL);
            // assert(NO);
        } else {
            [array addObject:url];
                
            // For testing purposes, we add a bogus link every five links.
            
            if (NO) {
                static int sErrorIndex;
                
                url = [NSURL URLWithString:[str stringByAppendingString:@"-bogus"] relativeToURL:self.URL];
                assert(url != nil);
                
                sErrorIndex += 1;
                if ((sErrorIndex % 5) == 0) {
                    [array addObject:url];
                    sErrorIndex += 1;
                }
            }
        }
    }
}
 
static void StartElementSAXFunc(
    void *          ctx,
    const xmlChar * name,
    const xmlChar **attrs
)
    // Called by the HTML parser when we encounter the beginning of a 
    // tag.  This looks for "a" and "img" tags and, within those, looks for 
    // "href" and "src" attributes, respectively.  Upon finding such an attribute 
    // it uses the value of the attribute to construct a URL to add to the relevant 
    // mutable results array.
{
    QHTMLLinkFinder *   obj;
    size_t      attrIndex;
    
    obj = (QHTMLLinkFinder *) ctx;
    assert([obj isKindOfClass:[QHTMLLinkFinder class]]);
    
    // libxml2's HTML parser lower cases tag and attribute names, so 
    // strcmp (rather than strcasecmp) is correct here.
    
    // Tags without attributes are not useful to us.
    
    if (attrs != NULL) {
    
        // Check for the tags we care about and, within them, check for 
        // the attributes we care about.
        
        if ( strcmp( (const char *) name, "a") == 0 ) {
            attrIndex = 0;
            while (attrs[attrIndex] != NULL) {
                if ( strcmp( (const char *) attrs[attrIndex], "href") == 0 ) {
                    [obj addURLForCString:(const char *) attrs[attrIndex + 1] toArray:obj.mutableLinkURLs];
                }
                attrIndex += 2;
            }
        } else if ( strcmp( (const char *) name, "img") == 0 ) {
            attrIndex = 0;
            while (attrs[attrIndex] != NULL) {
                if ( strcmp( (const char *) attrs[attrIndex], "src") == 0 ) {
                    [obj addURLForCString:(const char *) attrs[attrIndex + 1] toArray:obj.mutableImageURLs];
                }
                attrIndex += 2;
            }
        }
    }
}
 
static xmlSAXHandler gSAXHandler = {
    .initialized  = XML_SAX2_MAGIC,
    .startElement = StartElementSAXFunc
};
 
- (void)main
{
    struct _xmlParserCtxt * context;
 
    // Create and run a libxml2 HTML parser.
    
    context = htmlCreatePushParserCtxt(
        &gSAXHandler,
        self,
        NULL,
        0,
        nil,
        XML_CHAR_ENCODING_NONE
    );
    if (context == NULL) {
        self.error = [NSError errorWithDomain:NSXMLParserErrorDomain code:XML_ERR_INTERNAL_ERROR userInfo:nil];
    } else {
        int     err;
        
        // If the client has specified relaxed parsing, set that up in the 
        // libxml2 parser.  First try with HTML_PARSE_RECOVER and, if that 
        // fails, retry without it.
        
        if (self.useRelaxedParsing) {
            err = htmlCtxtUseOptions(context, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
            if (err != 0) {
                (void) htmlCtxtUseOptions(context, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
            }
            // We really don't care if this stuff fails.  err gets overwritten by the call 
            // to htmlParseChunk below.
        }
 
        // htmlParseChunk will only accept an int as the data length. On 64-bit builds, 
        // that's a problem, because [self.data length] is an NSUInteger, which might be greater 
        // than 2 GB.  I could address this properly (by calling htmlParseChunk repeatedly on 
        // 2 GB chunks) but IMO that's not a great solution; if you're parsing data that big, 
        // you really don't want to hold it all in memory even in a 64-bit process.  So, for 
        // the sake of simplicity, I've just added the following assert.
        
        assert( [self.data length] <= (NSUInteger) INT_MAX );
        
        // Parse the data.
        
        err = htmlParseChunk(
            context, 
            [self.data bytes], 
            (int) [self.data length], 
            YES
        );
        
        // Handle the result.
        
        if (err != 0) {
            if (self.error == nil) {
                // The libxml2 HTML parser shares the same errors as the XML parser, so we just 
                // borrow NSXMLParser's error domain.  Keep in mind that you might encounter 
                // errors that aren't explicitly listed in <Foundation/NSXMLParser.h>, such 
                // as XML_HTML_UNKNOWN_TAG.  See xmlParserErrors in <libxml/xmlerror.h> for 
                // the full list.
                self.error = [NSError errorWithDomain:NSXMLParserErrorDomain code:err userInfo:nil];
            }
        }
   
        // Clean up.
        
        htmlFreeParserCtxt(context);
    }
}
 
@end