/***************************************************************************
 *   Copyright (C) 2008 by Jacob Kanev <j_kanev@arcor.de>,                 *
 *   Thomas Fischer <fischer@unix-ag.uni-kl.de>                            *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
#include <tqfile.h>
#include <tqregexp.h>
#include <tqbuffer.h>
#include <tqspinbox.h>

#include <tdelocale.h>
#include <klineedit.h>
#include <tdemessagebox.h>
#include <kurl.h>
#include <kdebug.h>

#include "fileimporterbibtex.h"
#include "encoderxml.h"
#include "settings.h"
#include "webqueryciteseerx.h"

using BibTeX::Value;
using BibTeX::Entry;
using BibTeX::EntryField;

namespace KBibTeX
{

    //_______________________________________________________________________________________________________________
    // Construct widget

    // Builds the query widget and pre-fills the query line edit with the
    // text stored in the settings under the key "CiteSeerX".
    //   parent, name: forwarded unchanged to the WebQueryWidget base class.
    WebQueryCiteSeerXWidget::WebQueryCiteSeerXWidget( TQWidget *parent, const char *name )
            : WebQueryWidget( parent, name )
    {
        init();

        // fetch the stored default; a null string is replaced by an empty one
        TQString initialText = Settings::self()->getWebQueryDefault( "CiteSeerX" );

        if ( initialText == TQString::null )
            initialText = "";

        // show the text and notify the widget about the (initial) change
        lineEditQuery->setText( initialText );
        slotTextChanged( initialText, true );
    }


    //_______________________________________________________________________________________________________________
    // Construct

    // Creates the CiteSeerX query object together with its input widget.
    //   parent: passed to the WebQuery base class and used as parent of the widget.
    WebQueryCiteSeerX::WebQueryCiteSeerX( TQWidget* parent )
            : WebQuery( parent ), m_citeSeerXServer( "citeseerx.ist.psu.edu" )
    {
        // Start with a well-defined state: the hit counters are read (and
        // incremented) while parsing result pages, and the parser pointer is
        // invoked in getData(), so none of them may hold an indeterminate value.
        m_desiredHits = 0;
        m_receivedHits = 0;
        m_currentParser = 0;

        m_widget = new WebQueryCiteSeerXWidget( parent );
    }


    //_______________________________________________________________________________________________________________
    // Destroy

    // Releases the query widget that was allocated in the constructor.
    WebQueryCiteSeerX::~WebQueryCiteSeerX()
    {
        // m_widget was created with new in the constructor of this class
        delete m_widget;
    }


    //_______________________________________________________________________________________________________________
    // GUI string

    // Returns the (translated) name of this web query: "CiteSeerX".
    TQString WebQueryCiteSeerX::title()
    {
        const TQString caption = i18n( "CiteSeerX" );
        return caption;
    }


    //_______________________________________________________________________________________________________________
    // GUI info

    // Returns the (translated) disclaimer text: "About CiteSeerX".
    TQString WebQueryCiteSeerX::disclaimer()
    {
        const TQString aboutText = i18n( "About CiteSeerX" );
        return aboutText;
    }


    //_______________________________________________________________________________________________________________
    // URL for disclaimer

    // Returns the address of the CiteSeerX "about" page that belongs to the
    // disclaimer text.
    TQString WebQueryCiteSeerX::disclaimerURL()
    {
        const TQString aboutPage( "http://citeseerx.ist.psu.edu/about/site" );
        return aboutPage;
    }


    //_______________________________________________________________________________________________________________
    // return pointer to widget

    // Gives access to the input widget created in the constructor.
    // The returned pointer is the object's own m_widget; no copy is made.
    WebQueryWidget *WebQueryCiteSeerX::widget()
    {
        WebQueryWidget *queryWidget = m_widget;
        return queryWidget;
    }


    //_______________________________________________________________________________________________________________
    // user has pressed "Cancel"

    // Cancels the running search by discarding every request that is still
    // waiting in the queue. A job that has already been started is not
    // touched here; once it reports back, nextJob() finds the queue empty.
    void WebQueryCiteSeerX::cancelQuery()
    {
        // drop all pending (not yet started) requests
        m_queryQueue.clear();
    }


    //_______________________________________________________________________________________________________________
    // main function -- collects all queries for one search

    void WebQueryCiteSeerX::query()
    {
        // store CiteSeerX as future default
        WebQuery::query();
        Settings *settings = Settings::self();
        settings->setWebQueryDefault( "CiteSeerX", m_widget->lineEditQuery->text() );

        // read number of desired results from GUI
        m_queryQueue.clear();
        m_desiredHits = m_widget->spinBoxMaxHits->value();
        // one for each entry, and one for each page of 10 links
        setNumStages( m_desiredHits + ( m_desiredHits / 10 + 1 ) );

        // prepare search term
        TQString searchTerm = m_widget->lineEditQuery->text().stripWhiteSpace().replace( '$', "" );
        TQStringList queryWords = TQStringList::split( TQRegExp( "\\s+" ), searchTerm );

        if ( searchTerm.isEmpty() || queryWords.size() == 0 )
        {
            setEndSearch( WebQuery::statusInvalidQuery );
            return;
        }

        // build query from search term
        TQString query;

        for ( uint i = 0; i < queryWords.size(); ++i )
        {
            if ( i ) query += " AND ";

            query += queryWords[i];
        }

        query = query.replace( "%", "%25" ).replace( "+", "%2B" ).replace( " ", "%20" ).replace( "#", "%23" ).replace( "&", "%26" ).replace( "?", "%3F" );

        // schedule jobs
        DataRequest dr;
        dr.url = KURL( TQString( "http://citeseerx.ist.psu.edu/search?q=" ).append( query ).append( "&submit=Search&sort=rel" ) );
        dr.parser = &WebQueryCiteSeerX::parseSummaryPage;
        m_queryQueue.push_back( dr );

        // start job queue
        nextJob();
    }


    //_______________________________________________________________________________________________________________
    // process results from current job

    void WebQueryCiteSeerX::parseSummaryPage( const TQString& data )
    {
        // regexp. for finding paper entries (example: href="/viewdoc/summary;jsessionid=12345ABCD?doi=10.1.1.108.9937")
        TQRegExp paperXpr( "href=\"(/viewdoc/summary[^?]*\\?doi=[^\"]+)\"" );

        // count paper results and schedule single paper URLs

        for ( int p = paperXpr.search( data ); p >= 0; p = paperXpr.search( data, p + paperXpr.matchedLength() ) )
        {
            if ( ++m_receivedHits > m_desiredHits )
                break;

            DataRequest dr;

            dr.url = KURL( TQString( "http://" ) + m_citeSeerXServer + paperXpr.cap( 1 ) );

            dr.parser = &WebQueryCiteSeerX::parsePaperPage;

            m_queryQueue.push_back( dr );
        }

        // if we haven't reached the desired number of hits, schedule the next summary page
        TQRegExp nextSummaryXpr( "<a href=\"([^\"]+)\">Next 10" );

        if ( m_receivedHits < m_desiredHits )
            if ( nextSummaryXpr.search( data ) >= 0 )
            {
                DataRequest dr;
                dr.url = KURL( TQString( "http://" ) + m_citeSeerXServer + nextSummaryXpr.cap( 1 ).replace( "&amp;", "&" ) );
                dr.parser = &WebQueryCiteSeerX::parseSummaryPage;
                m_queryQueue.push_back( dr );
            }
    }


    //_______________________________________________________________________________________________________________
    // process the result of one single paper link

    // Parses the page of one single paper and publishes the entry found there.
    //   data: HTML text of the paper's page.
    //
    // The page is expected to contain a BibTeX record ("@type{id, ..."). Entry
    // type and id are taken from its header; abstract, title, author, year,
    // journal and pages are added as fields where the page provides them. The
    // finished entry is announced through the foundEntry() signal.
    // A page without a BibTeX record is skipped: no entry is emitted for it.
    void WebQueryCiteSeerX::parsePaperPage( const TQString& data )
    {
        // find type and id: @XXX{ YYY,
        // (minimal matching keeps both captures as short as possible)
        TQRegExp typeIdXpr( "@(.*)\\{(.*)," );
        typeIdXpr.setMinimal( true );

        // no BibTeX record on this page: do not publish an entry without
        // type and id
        if ( typeIdXpr.search( data ) < 0 )
            return;

        // create entry
        // NOTE(review): the entry is handed over via foundEntry(); who deletes
        // it cannot be seen from this file -- confirm against the receiver
        Entry *entry = new BibTeX::Entry( typeIdXpr.cap( 1 ), typeIdXpr.cap( 2 ) );

        // find abstract: <..>Abstract:</..> <..> XXX </..>
        parseForSingleExpression( "<[^<]+>Abstract:</[^<]+>\\s*<[^<]+>([^<]+)</[^<]+>", data, entry, BibTeX::EntryField::ftAbstract );

        // find title: title = {XXX}
        parseForSingleExpression( "title = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftTitle );

        // find author: author = {XXX}
        parseForSingleExpression( "author = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftAuthor );

        // find year: year = {XXX}
        parseForSingleExpression( "year = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftYear );

        // find journal: journal = {XXX}
        parseForSingleExpression( "journal = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftJournal );

        // find pages: pages = {XXX}
        parseForSingleExpression( "pages = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftPages );

        // publish what we've found
        emit foundEntry( entry, false );
    }


    //_______________________________________________________________________________________________________________
    // find single bibtex field in html page and add to entry

    // Searches the page text for one regular expression and, if it matches,
    // adds the first captured text to the entry as a new field.
    //   description: regular expression with (at least) one capture group
    //   data:        text to search in
    //   entry:       entry that receives the new field
    //   type:        type of the field to create
    // Nothing is added if the expression does not match.
    void WebQueryCiteSeerX::parseForSingleExpression( TQString description, const TQString &data, Entry *entry, BibTeX::EntryField::FieldType type )
    {
        TQRegExp fieldXpr( description );

        // search() returns a negative position if there is no match
        if ( fieldXpr.search( data ) < 0 )
            return;

        // wrap the captured text into a value and attach it as a field
        EntryField *newField = new EntryField( type );
        Value *fieldValue = new Value( fieldXpr.cap( 1 ), false );
        newField->setValue( fieldValue );
        entry->addField( newField );
    }


    //_______________________________________________________________________________________________________________
    // read data from the job and start the current parser

    // Slot connected to the result() signal of the jobs started in nextJob().
    //   job: the finished job; expected to be a TDEIO::StoredTransferJob.
    //
    // Advances the progress display, hands the downloaded text to the parser
    // selected for this request (m_currentParser) and then continues with the
    // next request. The parser is skipped if the job is missing, is not a
    // stored transfer job, reported an error, or if the search was aborted.
    void WebQueryCiteSeerX::getData( TDEIO::Job *job )
    {
        // advance GUI progress bar
        enterNextStage();

        // data() is only available on stored transfer jobs; the cast yields a
        // null pointer for a null job or a job of another type, so check the
        // result before using it
        TDEIO::StoredTransferJob *storedJob = dynamic_cast<TDEIO::StoredTransferJob*>( job );

        if ( storedJob && !storedJob->error() && !m_aborted )
        {
            // read data: write the raw bytes into a buffer and read them back
            // as text through a text stream
            TQBuffer data;
            data.open( IO_WriteOnly );
            data.writeBlock( storedJob->data() );
            data.close();
            data.open( IO_ReadOnly );
            TQTextStream ts( &data );
            TQString result = ts.read();
            data.close();

            // hand the read data over to the parser
            ( this->*m_currentParser )( result );
        }

        // proceed
        nextJob();
    }


    //_______________________________________________________________________________________________________________
    // call the next job

    void WebQueryCiteSeerX::nextJob()
    {
        // no more requests: finished
        if ( !m_queryQueue.size() )
        {
            setEndSearch( WebQuery::statusSuccess );
            m_receivedHits = 0;
        }
        // else: take the next request from queue and start it
        else if ( !m_aborted )
        {
            m_currentParser = m_queryQueue.front().parser;
            TDEIO::Job *job = TDEIO::storedGet( m_queryQueue.front().url, FALSE, FALSE );
            m_queryQueue.pop_front();
            connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( getData( TDEIO::Job * ) ) );
        }
    }

}

#include "webqueryciteseerx.moc"
