Tuesday, 29 July 2014 16:53
beggarly

Xmap, robots.txt and duplicate contents...

An open forum for opinions and general questions

Xmap, robots.txt and duplicate contents...

Postby claudiodestasio » Sat Feb 09, 2013 9:03 am

Hi all, sorry for my bad English!
I'm spending some time optimizing my Joomla site (www.studiodestasio.it) to remove duplicate content, trying to prevent non-SEF URLs from being indexed.
Among other things, I reviewed the robots.txt file and added the specific pages that I do NOT want to be indexed.
Everything is OK, but Google Webmaster Tools reports that some URLs in the sitemap generated by Xmap are blocked by robots.txt. It's just a warning, but I'd like to resolve this anomaly, however ...

It would be enough for Xmap, when generating the XML page, to exclude the URLs blocked by robots.txt.
I know almost nothing about PHP, but I understand that it would be very easy.

The file that generates the XML map is /components/com_xmap/views/xml/tmpl/default_class.php
I guess the following would be sufficient:
1) Open robots.txt
2) Read it line by line and, if a line contains "Disallow:", take the right-hand side and store it in an array
3) Then, before "writing" each URL to the sitemap, make sure it does not contain any of the strings stored in the array from point 2

I do not know if I could do it myself without knowing PHP ...
Is there anyone who could help me?
claudiodestasio
Fresh Boarder
Fresh Boarder
 
Posts: 2
Joined: Sat Feb 09, 2013 8:55 am

Re: Xmap, robots.txt and duplicate contents...

Postby claudiodestasio » Sun Feb 10, 2013 2:44 pm

I found (maybe) a temporary solution ...
Searching the web, I found a function that I simply inserted into the file /components/com_xmap/views/xml/tmpl/default_class.php.
It is not an elegant solution because the function is called for every URL written to the sitemap...
It seems like a simple thing to implement in future versions of Xmap, and it would allow specific links to be excluded simply by placing them in robots.txt.
How can it be optimized?

Code: Select all
<?php
/**
* @version         $Id$
* @copyright        Copyright (C) 2005 - 2009 Joomla! Vargas. All rights reserved.
* @license        GNU General Public License version 2 or later; see LICENSE.txt
* @author        Guillermo Vargas (guille@vargas.co.cr)
*/
// No direct access: stop immediately unless the _JEXEC constant has been
// defined by whatever included this file.
defined('_JEXEC') or die;

// Load displayer.php from the component directory (JPATH_COMPONENT) before the
// class below is declared. NOTE(review): presumably this file declares the
// XmapDisplayer base class that XmapXmlDisplayer extends -- confirm.
require_once(JPATH_COMPONENT . '/displayer.php');

/**
 * Displayer that writes sitemap nodes as XML <url> entries (standard sitemap
 * or Google News sitemap, depending on $isNews).
 *
 * Output is echoed directly; nothing is buffered by this class.
 */
class XmapXmlDisplayer extends XmapDisplayer
{

    /**
     *
     * @var array  Stores the list of links that have been already included in
     *             the sitemap to avoid duplicated items (keyed by final URL)
     */
    public $_links;

    /**
     *
     * @var string
     */
    public $view = 'xml';

    /**
     * @var bool Whether <title> elements are added (taken from the
     *           filter_showtitle request variable; only used when canEdit is set)
     */
    protected $showTitle = false;

    /**
     * @var bool Whether excluded nodes are still printed, flagged through
     *           <rowclass> (taken from the filter_showexcluded request variable)
     */
    protected $showExcluded = false;

    /**
     *
     * @var int Indicates if this is a google news sitemap or not
     */
    public $isNews = 0;

    /**
     * Sets up the default language, the request filters and the DB null date.
     *
     * @param mixed $config  Passed through to the parent constructor
     * @param mixed $sitemap Passed through to the parent constructor
     */
    public function __construct($config, $sitemap)
    {
        parent::__construct($config, $sitemap);
        $this->uids = array();

        $this->defaultLanguage = strtolower(JFactory::getLanguage()->getTag());
        // Reduce a tag such as "en-gb" to its primary subtag ("en"), except for
        // zh-cn / zh-tw which must keep their region. The whitelist entries
        // must not carry leading spaces, otherwise they can never match the
        // lowercased tag and the exception is silently ignored.
        if (preg_match('/^([a-z]+)-.*/', $this->defaultLanguage, $matches)
                && !in_array($this->defaultLanguage, array('zh-cn', 'zh-tw'), true)) {
            $this->defaultLanguage = $matches[1];
        }

        $this->showTitle = JRequest::getBool('filter_showtitle', 0);
        $this->showExcluded = JRequest::getBool('filter_showexcluded', 0);

        $db = JFactory::getDbo();
        $this->nullDate = $db->getNullDate();
    }

    /**
     * Prints an XML node for the sitemap
     *
     * @param stdclass $node
     * @return boolean false when the node is excluded (and excluded nodes are
     *                 not being shown) or when its link was already printed;
     *                 true otherwise, including when the URL is skipped
     *                 because robots.txt disallows it
     */
    public function printNode($node)
    {
        $node->isExcluded = false;
        if ($this->isExcluded($node->id, $node->uid)) {
            if (!$this->showExcluded || !$this->canEdit) {
                return false;
            }
            $node->isExcluded = true;
        }

        // In a news sitemap only nodes flagged as news items are printed
        if ($this->isNews && (!isset($node->newsItem) || !$node->newsItem)) {
            return true;
        }

        // Get the item's URL; a missing or zero "secure" flag is passed as -1
        $secure = (!isset($node->secure) || $node->secure == 0) ? -1 : $node->secure;
        $link = JRoute::_($node->link, true, $secure);

        // Site-specific rewrite (marked "only for me" by its author). It is
        // applied before the duplicate check so that de-duplication works on
        // the URL that is actually written to the sitemap.
        $link = str_replace('/forum/home/', '/forum/', $link);

        if (!isset($node->browserNav)) {
            $node->browserNav = 0;
        }

        if ($node->browserNav == 3              // ignore "no link"
                || !empty($this->_links[$link])) { // ignore links that have been added already
            return empty($this->_links[$link]);
        }

        $this->_links[$link] = 1;

        if (!isset($node->priority)) {
            $node->priority = "0.5";
        }

        if (!isset($node->changefreq)) {
            $node->changefreq = 'daily';
        }

        // Get the change frequency and priority for this item
        $changefreq = $this->getProperty('changefreq', $node->changefreq, $node->id, 'xml', $node->uid);
        $priority = $this->getProperty('priority', $node->priority, $node->id, 'xml', $node->uid);

        // Leave out URLs that robots.txt disallows. Still report success so
        // the caller's handling of this node is unchanged.
        if (!robots_allowed($link, "*")) {
            return true;
        }

        // Count only URLs that are really written to the sitemap
        $this->count++;

        echo '<url>' . "\n";
        echo '<loc>', $link, '</loc>' . "\n";
        if ($this->canEdit) {
            if ($this->showTitle) {
                echo '<title><![CDATA['.$node->name.']]></title>' . "\n";
            }
            if ($this->showExcluded) {
                echo '<rowclass>',($node->isExcluded? 'excluded':''),'</rowclass>';
            }
            echo '<uid>', $node->uid, '</uid>' . "\n";
            echo '<itemid>', $node->id, '</itemid>' . "\n";
        }

        // Normalise the modification date to a W3C UTC timestamp, or NULL
        // when the node carries no usable date
        $modified = (isset($node->modified) && $node->modified != FALSE && $node->modified != $this->nullDate && $node->modified != -1) ? $node->modified : NULL;
        if (!$modified && $this->isNews) {
            $modified = time();
        }
        if ($modified && !is_numeric($modified)) {
            $date = new JDate($modified);
            $modified = $date->toUnix();
        }
        if ($modified) {
            $modified = gmdate('Y-m-d\TH:i:s\Z', $modified);
        }

        // If this is not a news sitemap
        if (!$this->isNews) {
            if ($modified) {
                echo '<lastmod>', $modified, '</lastmod>' . "\n";
            }
            echo '<changefreq>', $changefreq, '</changefreq>' . "\n";
            echo '<priority>', $priority, '</priority>' . "\n";
        } else {
            if (isset($node->keywords)) {
                $keywords = htmlspecialchars($node->keywords);
            } else {
                $keywords = '';
            }

            if (!isset($node->language) || $node->language == '*') {
                $node->language = $this->defaultLanguage;
            }

            echo "<news:news>\n";
            echo '<news:publication>'."\n";
            echo '  <news:name>'.(htmlspecialchars($this->sitemap->params->get('news_publication_name'))).'</news:name>'."\n";
            echo '  <news:language>'.$node->language.'</news:language>'."\n";
            echo '</news:publication>'."\n";
            echo '<news:publication_date>', $modified, '</news:publication_date>' . "\n";
            echo '<news:title><![CDATA['.$node->name.']]></news:title>' . "\n";
            if ($keywords) {
                echo '<news:keywords>', $keywords, '</news:keywords>' . "\n";
            }
            echo "</news:news>\n";
        }
        echo '</url>', "\n";

        return true;
    }

    /**
     *
     * @param string $property The property that is needed
     * @param string $value The default value if the property is not found
     * @param int $Itemid   The menu item id
     * @param string $view  (xml / html)
     * @param int $uid      Unique id of the element on the sitemap
     *                      (the id asigned by the extension)
     * @return string
     */
    public function getProperty($property, $value, $Itemid, $view, $uid)
    {
        if (isset($this->jview->sitemapItems[$view][$Itemid][$uid][$property])) {
            return $this->jview->sitemapItems[$view][$Itemid][$uid][$property];
        }
        return $value;
    }

    /**
     * Called on every level change
     *
     * @param int $level
     * @return boolean
     */
    public function changeLevel($level)
    {
        return true;
    }

    /**
     * Function called before displaying the menu
     *
     * @param stdclass $menu The menu node item
     * @return boolean
     */
    public function startMenu($menu)
    {
        return true;
    }

    /**
     * Function called after displaying the menu
     *
     * @param stdclass $menu The menu node item
     * @return boolean
     */
    public function endMenu($menu)
    {
        return true;
    }

}

/**
 * Tells whether a URL may be crawled according to the Disallow rules of a
 * robots.txt file.
 *
 * Only "User-agent" and "Disallow" lines are interpreted; "Allow" lines are
 * ignored. Rules of every group whose User-agent value contains "*" or the
 * given user agent (case-insensitive) are combined. Rules are matched against
 * the URL path plus query string, anchored at the start; "*" inside a rule
 * matches any run of characters and a trailing "$" anchors the end.
 *
 * The robots.txt file is read and parsed once per location/user agent and the
 * result is cached for the rest of the request, so calling this function for
 * every sitemap URL does not fetch the file again.
 *
 * @param string       $url            Absolute or root-relative URL to test
 * @param string|false $useragent      Extra user agent to match besides "*"
 * @param string       $robotsLocation URL or local path of the robots.txt file
 * @return boolean true when the URL is not disallowed, or when the robots.txt
 *                 file is missing, unreadable or empty
 */
function robots_allowed($url, $useragent = false, $robotsLocation = 'http://www.studiodestasio.it/robots.txt')
{
    $patterns = robots_disallow_patterns($robotsLocation, $useragent);

    // no robots.txt or no applicable rule: we're allowed in
    if (empty($patterns)) {
        return true;
    }

    // Build the string the rules are tested against: path (defaulting to "/")
    // plus the query string, so that rules such as "/index.php?" can match.
    $parsed = parse_url($url);
    $target = (is_array($parsed) && isset($parsed['path']) && $parsed['path'] !== '') ? $parsed['path'] : '/';
    if (is_array($parsed) && isset($parsed['query'])) {
        $target .= '?' . $parsed['query'];
    }

    foreach ($patterns as $pattern) {
        // check if page is disallowed to us
        if (preg_match($pattern, $target)) {
            return false;
        }
    }

    // page is not disallowed
    return true;
}

/**
 * Reads a robots.txt file and returns its applicable Disallow rules as
 * ready-to-use regular expressions (cached per location and user agent).
 *
 * @param string       $location  URL or local path of the robots.txt file
 * @param string|false $useragent Extra user agent to match besides "*"
 * @return array List of PCRE patterns; empty when nothing is disallowed
 */
function robots_disallow_patterns($location, $useragent)
{
    static $cache = array();

    $key = $location . "\n" . $useragent;
    if (isset($cache[$key])) {
        return $cache[$key];
    }

    // User agents whose groups apply to us: always "*", plus the given one.
    // Quote with the delimiter so an agent such as "Mozilla/5.0" is safe.
    $agents = array(preg_quote('*', '/'));
    if ($useragent) {
        $agents[] = preg_quote($useragent, '/');
    }
    $agentPattern = '/(' . implode('|', array_unique($agents)) . ')/i';

    $patterns = array();

    // Deliberately best-effort: a failed read means "no restrictions". The
    // warning is suppressed because the caller is echoing XML and any warning
    // text would corrupt the sitemap.
    $lines = @file($location);

    if (!empty($lines)) {
        $applies = false;     // does the current group apply to us?
        $inAgentList = false; // was the previous directive a User-agent line?

        foreach ($lines as $line) {
            // strip inline comments, then skip blank lines
            $hash = strpos($line, '#');
            if ($hash !== false) {
                $line = substr($line, 0, $hash);
            }
            $line = trim($line);
            if ($line === '') {
                continue;
            }

            if (preg_match('/^User-agent\s*:\s*(.*)$/i', $line, $match)) {
                $matched = (bool) preg_match($agentPattern, $match[1]);
                // consecutive User-agent lines share one group of rules
                $applies = $inAgentList ? ($applies || $matched) : $matched;
                $inAgentList = true;
                continue;
            }
            $inAgentList = false;

            if ($applies && preg_match('/^Disallow\s*:\s*(.*)$/i', $line, $match)) {
                $rule = $match[1];
                // an empty rule disallows nothing; other rules still apply
                if ($rule === '') {
                    continue;
                }
                // a trailing "$" anchors the rule at the end of the URL
                $anchored = (substr($rule, -1) === '$');
                if ($anchored) {
                    $rule = substr($rule, 0, -1);
                }
                // quote everything, then turn the "*" wildcard back into ".*"
                $regex = str_replace('\*', '.*', preg_quote($rule, '/'));
                $patterns[] = '/^' . $regex . ($anchored ? '$' : '') . '/';
            }
        }
    }

    $cache[$key] = $patterns;
    return $patterns;
}
claudiodestasio
Fresh Boarder
Fresh Boarder
 
Posts: 2
Joined: Sat Feb 09, 2013 8:55 am


Return to General



Who is online

Users browsing this forum: No registered users and 4 guests