<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:georss='http://www.georss.org/georss' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-32189452</id><updated>2012-02-16T11:27:34.322Z</updated><category term='glexec'/><category term='SL4'/><category term='Christmas Holidays'/><category term='YAIM'/><category term='MCE Errors'/><category term='DNS'/><category term='CRLs'/><category term='CHEP07'/><category term='CASTEP'/><category term='SL5'/><category term='ssc'/><category term='torque'/><category term='CE'/><category term='UI'/><category term='Users'/><category term='ATLAS'/><category term='cream'/><category term='EGEE'/><category term='cfengine'/><category term='SGE'/><category term='Australia'/><category term='WMS'/><category term='t2review'/><category term='yum'/><category term='LCMAPS'/><category term='SPEC'/><category term='MC production'/><category term='SLC'/><category term='ALICE'/><category term='Outage'/><category term='APEL'/><category term='rfio'/><category term='AliEn'/><category term='GridPP19'/><category term='Apache'/><category term='SSP'/><category term='WLCG'/><category term='MPI'/><category term='Ganglia'/><category term='Transfer Tests'/><category term='splunk'/><category term='Holidays'/><category term='UKI'/><category term='IPv6'/><category term='cic portal'/><category term='DPM'/><category term='gatekeeper'/><category term='ScotGRID-Edinburgh'/><category term='Accounting'/><category term='java'/><category term='mysql'/><category term='voms'/><category term='downtime'/><category term='vmware'/><category term='arc'/><category term='TPM'/><category term='camont'/><category term='scotgrid-gla'/><category term='Perl'/><category term='totalep'/><category term='glasgow'/><category 
term='gridftp'/><category term='UKI-SCOTGRID-ECDF'/><category term='SRB'/><category term='Resource Broker'/><category term='milk'/><category term='NAT'/><category term='Air Con'/><category term='people'/><category term='UKI-SCOTGRID-EDINBURGH'/><category term='disk035'/><category term='AstroGrid'/><category term='Upgrade'/><category term='Second Cream CE'/><category term='YPF'/><category term='VPAC'/><category term='grid-mapfile'/><category term='RGMA'/><category term='GRAM'/><category term='GOCDB'/><category term='LHCb'/><category term='GridICE'/><category term='OPENMPI'/><category term='AUSGRID'/><category term='snmp'/><category term='vmem'/><category term='ganga'/><category term='UKI-SCOTGRID-GLASGOW'/><category term='Blog'/><category term='Ballarat'/><category term='lightpath'/><category term='wiki'/><category term='support'/><category term='BDII'/><category term='Panda'/><category term='svr031'/><category term='gqsub'/><category term='nanocmos'/><category term='avoiding dairy puns'/><category term='SL5.4'/><category term='rpm'/><category term='ECDF'/><category term='gaussian'/><category term='Data Management'/><category term='LCG CE'/><category term='VM'/><category term='storage benchmarks'/><category term='zeus'/><category term='analysis'/><category term='python'/><category term='DIRAC'/><category term='FTS'/><category term='Reliabilty'/><category term='FC'/><category term='glite-UI'/><category term='nfs'/><category term='UKI-SCOTGRID-DURHAM'/><category term='Storage'/><category term='FDR'/><category term='NGS'/><category term='power outage'/><category term='Network'/><category term='cvmfs'/><category term='dCache'/><category term='Lustre'/><category term='NA4'/><category term='QOS'/><category term='Publicity'/><category term='maui'/><category term='SuperNEMO'/><category term='MonAMI'/><category term='biomed'/><category term='awesome'/><category term='SCAS'/><category term='SAM'/><category term='nagios'/><category term='monitoring'/><category 
term='LFC'/><category term='SRM'/><category term='pheno'/><category term='Steve Lloyd tests'/><category term='lumerical'/><category term='Uppsala'/><category term='Water Cooling'/><category term='iperf'/><category term='X509'/><category term='Fabric Management'/><category term='VO'/><category term='blah'/><category term='job wrapper'/><category term='HEPiX'/><category term='SSD'/><category term='VeRSI'/><category term='ScotLUG'/><category term='SAN'/><category term='Rant'/><category term='GridPP'/><category term='Pond'/><category term='Job Efficiency'/><category term='SVN'/><category term='LUV'/><category term='GPFS'/><category term='outreach'/><title type='text'>ScotGrid</title><subtitle type='html'>This is the blog for the ScotGrid distributed Tier-2 at the Universities of Durham, Edinburgh and Glasgow.  
&lt;br&gt;
ScotGrid is part of the GridPP project, the EGI project and WLCG.</subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default?max-results=100'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><link rel='next' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default?start-index=101&amp;max-results=100'/><author><name>Graeme Stewart</name><uri>http://www.blogger.com/profile/04113191724360870254</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='21' height='32' src='http://www.physics.gla.ac.uk/~graeme/graeme.jpg'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>496</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>100</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-32189452.post-2914177616067136079</id><published>2011-12-21T10:00:00.007Z</published><updated>2011-12-21T22:11:09.372Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='torque'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>Batch system juggling</title><content type='html'>We've been a bit quiet up here recently.  This is normally a sign of either nothing interesting happening, or entirely too many interesting things happening.  Opinions on that may divide, but I think it's closer to the latter...&lt;br /&gt;&lt;br /&gt;One of the recent bits of fun that occurred was with our batch server.  This story actually starts a long time ago; about this time last year.  
At that point, we started to get intermittent memory errors from the Torque server - corrected by ECC - but that's generally a sign that the RAM's about to fail.  Given that the batch server is a single point of failure for a site, that's not a good thing.&lt;br /&gt;&lt;br /&gt;So I spent some time preparing a spare box, and being ready to move the batch system over, in case it failed over the winter break. Which, after all that prep, it didn't, and the errors stopped.  On the expectation that the current hardware was nearing end of life, we ordered a new box early this year, and have had it sitting in a machine room for a while.&lt;br /&gt;&lt;br /&gt;Unfortunately we didn't get time to have it running a tested batch system until our power supply started to ... well, insert colourful metaphor here, describing the 8 months where we were affected by lack of power.&lt;br /&gt;&lt;br /&gt;Power got to stable supply in September, and so to catch up on things.  One of the things we got around to was software versions.  Whilst we didn't intend to update the Torque version, and managed to avoid it for a bit, the gLite developers eventually managed to sneak the update past us as part of an ordinary gLite update.  Strictly, this didn't affect the batch server, just all the CE's, making them incompatible with the previous version of Torque.&lt;br /&gt;&lt;br /&gt;Whilst a clever manoeuvre, reminiscent of Odysseus' Pony, it did leave us with a conundrum of either reverting the gLite update, or running forward with it.  Neither were options of good character, but running forward did have some actual documentation; hence it was full speed ahead.&lt;br /&gt;&lt;br /&gt;Which worked out well enough.  The Torque 2.5.7 packages were set to use Munge, so getting that installed and tested as a first step helped it go smoothly.  
To preserve compatibility in file locations, we used /etc/sysconfig/pbs_mom to put the pbs working directories in the same place as previously - meaning we didn't have to reconfigure any other tools.&lt;br /&gt;&lt;br /&gt;What didn't go so smoothly was the memory leak in the server.&lt;br /&gt;&lt;br /&gt;Which gave it a runtime of around 36 hours between crashes.  Actually, not even crashes - we found that the pbs_server process hit either&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;12/05/2011 10:19:12;0080;PBS_Server;Req;req_reject;Reject reply code=15012(PBS_Server System error: No child processes MSG=could not unmunge credentials), aux=0, type=AlternateUserAuthentication, from tomcat@svr021.gla.scotgrid.ac.uk&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;or&lt;br /&gt;&lt;blockquote&gt; &lt;br /&gt;10/29/2011 18:11:24;0001;PBS_Server;Svr;PBS_Server;LOG_ERROR::Cannot allocate memory (12) in send_job, fork failed&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;and then sat around moaning.  Had it crashed hard, then the auto-restart would have caught it.  Ho, hum, one for the Fast Fail philosophy there.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;By this point, my proof reader is pointing out that I started off talking hardware, and now talking software.  Punchline is that the new server that we never got a chance to use has a lot more RAM than the old server.  Therefore we wanted to move the server from the old hardware to the new, to give it a lot more RAM space.  That won't fix the memory leak, but will mitigate the problem a bit.&lt;br /&gt;&lt;br /&gt;Conventionally, this would involve draining the cluster, repositioning the CE's and then starting up everything again.  Had we done that, this blog post would be over now.&lt;br /&gt;&lt;br /&gt;Instead, we did a rolling update.  This let us move things over without having to do a full drain.  
The biggest problem with a full drain is that, while most of the jobs finish within a shorter period of time than the limit, there are always some that take the full duration.  This leaves us with an empty cluster, doing nothing, for 24 hours or so, waiting on a couple of jobs to finish.&lt;br /&gt;&lt;br /&gt;So, instead, by moving things in small batches, then we can keep most of the nodes working, and thus get more work out of things.  Step zero is to disable cfengine, otherwise it tends to try and 'fix' things part way through.&lt;br /&gt;&lt;br /&gt;Step one is to drain a CE, which we did over a weekend, and a small number of nodes, which we put offline on the Sunday morning.&lt;br /&gt;&lt;br /&gt;Come Monday, I set up and tested basic operations with the new batch server, and then moved the freed up nodes across to it.  Once those were tested (which shook out a couple of issues about versioning of some libs), point the CE at the new batch server, and then run a test job through it.  (It turns out that Atlas are fast enough to sneak some pilots through a 2 minute window for a test job.  However, only a few, so they actually functioned as effective tests, without compromising the site if they failed).&lt;br /&gt;&lt;br /&gt;After that, it's time to offline another CE, and then some more nodes, and start moving nodes over when they were empty.  
In the end I scripted this:&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;#!/bin/sh&lt;br /&gt;&lt;br /&gt;NODE=$1&lt;br /&gt;RUNNING=$(qstat -n -1 | grep $NODE | wc --lines)&lt;br /&gt;&lt;br /&gt;if [ "x${RUNNING}" != "x0" ]&lt;br /&gt;        then&lt;br /&gt;        echo $NODE: Still $RUNNING jobs going, skipping&lt;br /&gt;        exit 2&lt;br /&gt;fi&lt;br /&gt;&lt;br /&gt;CORES=$(qmgr -c "print node ${NODE}" | grep "np = " | cut -d= -f2)&lt;br /&gt;&lt;br /&gt;FROM=svr666&lt;br /&gt;TO=svr999&lt;br /&gt;&lt;br /&gt;echo $NODE: Moving to ${TO} with ${CORES} cores&lt;br /&gt;&lt;br /&gt;ssh ${TO} "~/addNode.sh ${NODE} ${CORES}"&lt;br /&gt;&lt;br /&gt;ssh ${NODE} "service pbs_mom stop"&lt;br /&gt;scp config.mom.svr666 ${NODE}:/var/spool/pbs/mom_priv/config&lt;br /&gt;ssh ${NODE} "service pbs_mom start"&lt;br /&gt;&lt;br /&gt;ssh ${FROM} "~/deleteNode.sh ${NODE}"&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;In theory one can run qmgr remotely, rather than ssh-ing to the batch servers and running a script.  In practice, with the different versions of Torque, I couldn't get that to work.  Note the automation of the mom config switch as well; and that this script checks that the node is empty.&lt;br /&gt;&lt;br /&gt;This reduced the gradual move of nodes to a process of croning the script, and offlining nodes occasionally.&lt;br /&gt;&lt;br /&gt;The net result was that we were operating at around 80% capacity for 48 hours, and it was all rather uneventful - in a good way.  
The final step was to update cfengine config and re-enable it.&lt;br /&gt;&lt;br /&gt;One of the plus points of the above script is that it should be simple to adapt to two distinct batch systems; which means if we end up moving away from Torque, we should be able to do that without downtime too.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2914177616067136079?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2914177616067136079/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2914177616067136079' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2914177616067136079'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2914177616067136079'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/12/batch-system-juggling.html' title='Batch system juggling'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8232998047328952843</id><published>2011-09-23T14:19:00.003+01:00</published><updated>2011-09-23T14:21:27.952+01:00</updated><title type='text'>Leaving Lyon</title><content type='html'>The EGI Tech Forum is winding down, with only a few talks remaining. It's been a great meeting, with a wide range of talks on all areas of Grid Computing. 
Lots to think about and new ideas to try out!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8232998047328952843?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8232998047328952843/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8232998047328952843' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8232998047328952843'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8232998047328952843'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/09/leaving-lyon.html' title='Leaving Lyon'/><author><name>David Crooks</name><uri>http://www.blogger.com/profile/07412551479798045933</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4353000119367915586</id><published>2011-09-21T13:01:00.003+01:00</published><updated>2011-09-21T13:02:40.063+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-EDINBURGH'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-DURHAM'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='glasgow'/><title type='text'>Scotgrid goes South</title><content type='html'>Last week we attended the bi-annual GridPP Collaboration meeting.&lt;br /&gt; The venue this time was CERN itself and the meeting was, as ever, incredibly useful.&lt;br /&gt;&lt;br /&gt;We were lucky enough to have 
presentations from the Experiments, the LHC, EGI and the WLCG community as well as presentations from across the UK collaboration.&lt;br /&gt;&lt;br /&gt;A full programme of the meeting is available here:&lt;br /&gt;&lt;br /&gt;http://www.gridpp.ac.uk/gridpp27/&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-TndsMlJ0ULs/TnnR4vDn7xI/AAAAAAAAAJw/DZghcVEcW5o/s1600/DSC01427.JPG" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="205" src="http://4.bp.blogspot.com/-TndsMlJ0ULs/TnnR4vDn7xI/AAAAAAAAAJw/DZghcVEcW5o/s320/DSC01427.JPG" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;Above is a picture of our own Dr Crooks presenting on the Glasgow Security Model&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4353000119367915586?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4353000119367915586/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4353000119367915586' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4353000119367915586'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4353000119367915586'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/09/scotgrid-goes-south.html' title='Scotgrid goes South'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' 
src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/-TndsMlJ0ULs/TnnR4vDn7xI/AAAAAAAAAJw/DZghcVEcW5o/s72-c/DSC01427.JPG' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8852198726154263613</id><published>2011-09-19T07:34:00.004+01:00</published><updated>2011-09-19T07:46:40.282+01:00</updated><title type='text'>EGI Tech Forum 2011</title><content type='html'>&lt;a href="http://1.bp.blogspot.com/-5nlyYmRSRn4/TnblLQdHzAI/AAAAAAAAAAQ/bRJNnVE4z_k/s1600/IMG_0040%2B%25281%2529.jpg" onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 240px;" src="http://1.bp.blogspot.com/-5nlyYmRSRn4/TnblLQdHzAI/AAAAAAAAAAQ/bRJNnVE4z_k/s320/IMG_0040%2B%25281%2529.jpg" border="0" alt="" id="BLOGGER_PHOTO_ID_5653958363768671234" /&gt;&lt;/a&gt;&lt;i&gt;Bonjour Lyon!&lt;/i&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;After last week's GridPP 27 meeting in CERN, this week we are in Lyon for the 2011 EGI Tech Forum, running from Monday until Friday this week. 
You can follow the Forum online using some of the links &lt;a href="http://tf2011.egi.eu/media_room/index.html"&gt;here&lt;/a&gt;.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;More later - time now to find some coffee before the first session...&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8852198726154263613?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8852198726154263613/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8852198726154263613' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8852198726154263613'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8852198726154263613'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/09/egi-tech-forum-2011.html' title='EGI Tech Forum 2011'/><author><name>David Crooks</name><uri>http://www.blogger.com/profile/07412551479798045933</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/-5nlyYmRSRn4/TnblLQdHzAI/AAAAAAAAAAQ/bRJNnVE4z_k/s72-c/IMG_0040%2B%25281%2529.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-6771085777278509063</id><published>2011-08-25T17:59:00.000+01:00</published><updated>2011-08-25T18:00:35.817+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='QOS'/><title type='text'>Busy 
Disks</title><content type='html'>After checking a test 10 gig Disk Server deployment we uncovered an interesting pattern in storage network activity and how our 10 Gig switch copes with multiple connections at 10 Gigabit. The captures below were taken over a 5 minute window of operation and show just how bursty the traffic patterns from these devices can be.&lt;br /&gt;&lt;br /&gt;The graphs show all interfaces on our Dell 8024F and the measurement window is in Mbps. The order is top to bottom with the initial capture at the top.&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-xPowx4ifWWQ/TlZ-Lva8iUI/AAAAAAAAAJM/ilMCgDFTWJw/s1600/1.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="168" src="http://1.bp.blogspot.com/-xPowx4ifWWQ/TlZ-Lva8iUI/AAAAAAAAAJM/ilMCgDFTWJw/s320/1.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-Yb_2vCxlhLI/TlZ-La9Y9ZI/AAAAAAAAAJI/udYknBG38YY/s1600/2.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="181" src="http://1.bp.blogspot.com/-Yb_2vCxlhLI/TlZ-La9Y9ZI/AAAAAAAAAJI/udYknBG38YY/s320/2.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a 
href="http://1.bp.blogspot.com/-enQ3GGskd3o/TlZ-LC3iycI/AAAAAAAAAJE/ZgA5KGYh8tg/s1600/3.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="194" src="http://1.bp.blogspot.com/-enQ3GGskd3o/TlZ-LC3iycI/AAAAAAAAAJE/ZgA5KGYh8tg/s320/3.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-RDuXHPwwr4E/TlZ-Kw35ldI/AAAAAAAAAJA/9VO70D15dAU/s1600/4.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="206" src="http://2.bp.blogspot.com/-RDuXHPwwr4E/TlZ-Kw35ldI/AAAAAAAAAJA/9VO70D15dAU/s320/4.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-wBKH1Vm_CEQ/TlZ-Ki2kQDI/AAAAAAAAAI8/4mSwrtwu8OA/s1600/5.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="176" src="http://1.bp.blogspot.com/-wBKH1Vm_CEQ/TlZ-Ki2kQDI/AAAAAAAAAI8/4mSwrtwu8OA/s320/5.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-xn1_Qm6XRGk/TlZ-KbKxLiI/AAAAAAAAAI4/3Bh9FZbvxhU/s1600/6.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="196" src="http://3.bp.blogspot.com/-xn1_Qm6XRGk/TlZ-KbKxLiI/AAAAAAAAAI4/3Bh9FZbvxhU/s320/6.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;While the Disk servers have been hammering away the round trip time intra room has been on average 0.40 msec between devices as the CPU on the core Dell seems more than happy to be handle these loads as its utilisation is approximately 20% presently.&lt;br /&gt;&lt;br /&gt;We are planning to enable QOS metrics on disk server traffic shortly to test the response times on QOS and Non-QOS disk servers.&lt;br 
/&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-6771085777278509063?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/6771085777278509063/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=6771085777278509063' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6771085777278509063'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6771085777278509063'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/08/busy-disks.html' title='Busy Disks'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/-xPowx4ifWWQ/TlZ-Lva8iUI/AAAAAAAAAJM/ilMCgDFTWJw/s72-c/1.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5086110591951769478</id><published>2011-08-25T17:22:00.000+01:00</published><updated>2011-08-25T17:23:09.140+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='HEPiX'/><category scheme='http://www.blogger.com/atom/ns#' term='IPv6'/><title type='text'>News Flash from ScotGrid Labs</title><content type='html'>In my last post, we were investigating deployments of IPv6 on the test Cluster, the 1st one of which was using SLAAC to assign 
addressing to hosts. Interestingly enough it worked, first time out the tin.&lt;br /&gt;&lt;br /&gt;An IPv6 Traceroute from the web is shown below:&lt;br /&gt;&lt;br /&gt;traceroute to 2001:630:40:ef0:230:48ff:fe5a:4b7 (2001:630:40:ef0:230:48ff:fe5a:4b7), 30 hops max, 40 byte packets&lt;br /&gt;&amp;nbsp;1&amp;nbsp; 2001:1af8:4200:b000::1 (2001:1af8:4200:b000::1)&amp;nbsp; 1.600 ms&amp;nbsp; 1.813 ms&amp;nbsp; 1.882 ms&lt;br /&gt;&amp;nbsp;2&amp;nbsp; 2001:1af8:4100::5 (2001:1af8:4100::5)&amp;nbsp; 1.320 ms&amp;nbsp; 1.392 ms&amp;nbsp; 1.465 ms&lt;br /&gt;&amp;nbsp;3&amp;nbsp; be11.crs.evo.leaseweb.net (2001:1af8::9)&amp;nbsp; 2.587 ms&amp;nbsp; 2.631 ms&amp;nbsp; 2.619 ms&lt;br /&gt;&amp;nbsp;4&amp;nbsp; linx-gw1.ja.net (2001:7f8:4::312:1)&amp;nbsp; 8.475 ms&amp;nbsp; 8.466 ms&amp;nbsp; 8.453 ms&lt;br /&gt;&amp;nbsp;5&amp;nbsp; ae1.lond-sbr4.ja.net (2001:630:0:10::151)&amp;nbsp; 78.338 ms&amp;nbsp; 78.388 ms&amp;nbsp; 78.376 ms&lt;br /&gt;&amp;nbsp;6&amp;nbsp; 2001:630:0:10::109 (2001:630:0:10::109)&amp;nbsp; 9.900 ms&amp;nbsp; 9.479 ms&amp;nbsp; 9.446 ms&lt;br /&gt;&amp;nbsp;7&amp;nbsp; so-5-0-0.warr-sbr1.ja.net (2001:630:0:10::36)&amp;nbsp; 13.320 ms&amp;nbsp; 13.196 ms&amp;nbsp; 13.317 ms&lt;br /&gt;&amp;nbsp;8&amp;nbsp; 2001:630:0:10::296 (2001:630:0:10::296)&amp;nbsp; 18.705 ms&amp;nbsp; 18.542 ms&amp;nbsp; 18.793 ms&lt;br /&gt;&amp;nbsp;9&amp;nbsp; clydenet.glas-sbr1.ja.net (2001:630:0:8044::206)&amp;nbsp; 18.947 ms&amp;nbsp; 18.931 ms&amp;nbsp; 18.948 ms&lt;br /&gt;10&amp;nbsp; 2001:630:42:0:3e::9a (2001:630:42:0:3e::9a)&amp;nbsp; 19.434 ms !X&amp;nbsp; 18.214 ms !X&amp;nbsp; 17.682 ms !X&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The next phase of testing will be to enable a webserver to speak in both IPv4 and IPv6 using this access mechanism and then onto a Grid services .&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;I will post up a more detailed explanation of the mechanisms used for this soon.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' 
src='https://blogger.googleusercontent.com/tracker/32189452-5086110591951769478?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5086110591951769478/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5086110591951769478' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5086110591951769478'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5086110591951769478'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/08/news-flash-from-scotgrid-labs.html' title='News Flash from ScotGrid Labs'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7671219872673079268</id><published>2011-08-23T16:02:00.000+01:00</published><updated>2011-08-23T16:03:13.185+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='HEPiX'/><title type='text'>Two Stacks are better than one</title><content type='html'>Leading on from the last post, we have also re-introduced a new test cluster. This infrastructure is housed within the same rack as our old worker nodes&amp;nbsp; but is completely independent of the production cluster. 
Supporting a Dell 8024F are 5 servers and a Dell 5000 series switch which are connected via an independent 1 gigabit fibre connection to the University's network.&lt;br /&gt;&lt;br /&gt;The purpose of this cluster is to test IPv4/IPv6 dual stack connectivity for grid Services, the testing of switch based security mechanisms and SL6 NAT testing without fear of impacting the real cluster.&lt;br /&gt;&lt;br /&gt;The IPv6 connectivity model testing will be in multiple phases which include:&lt;br /&gt;&lt;br /&gt;* &lt;a href="http://en.wikipedia.org/wiki/IPv6#Stateless_address_autoconfiguration"&gt;SLAAC&lt;/a&gt; &lt;br /&gt;* &lt;a href="http://en.wikipedia.org/wiki/IPv6#Configured_and_automated_tunneling_.286in4.29"&gt;IPv6 to IPv4 tunneling&lt;/a&gt;&lt;br /&gt;* IPv6 Routing&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This framework is designed to comply with the &lt;a href="https://w3.hepix.org/ipv6-bis/doku.php?id=ipv6:testbed"&gt;HEPIX IPv6 Project &lt;/a&gt;and to look at the possible connection models required by Tier-2s to utilise IPv6. 
Additionally, we will be testing a wide variety of Grid enabled applications and associated systems such as Nagios to investigate potential issues within a dual stack deployment.&lt;br /&gt;&lt;br /&gt;More on this soon.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7671219872673079268?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7671219872673079268/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7671219872673079268' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7671219872673079268'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7671219872673079268'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/08/two-stacks-are-better-than-one.html' title='Two Stacks are better than one'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-134470251920865229</id><published>2011-08-23T15:40:00.003+01:00</published><updated>2011-08-23T16:04:23.032+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><title type='text'>Night of the Return of the Living Worker Nodes</title><content type='html'>As Glasgow is currently being used as one of the sets for World War Z, we thought it only apt that we too resurrect the dead and get them to do our bidding. 
No, we haven't embraced "mad" science. &lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;During the power work&amp;nbsp; we decided to alter the layout of 243d. Historically, the room had housed a mainframe including operators' booths. One of these booths still existed within 243d, so we took down one of the walls and added a new cabinet.&lt;br /&gt;&lt;br /&gt;While the work was being conducted to remove the wall we covered the cluster and powered it off to minimise dust ingestion. If you wish to gift wrap a cluster we have plenty of experience in this field. However, our wrapping is limited to blue plastic presently.&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-5LTtzvv8RYc/TlO6jr80rAI/AAAAAAAAAIs/k9v10BKsg2g/s1600/cluster.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="191" src="http://2.bp.blogspot.com/-5LTtzvv8RYc/TlO6jr80rAI/AAAAAAAAAIs/k9v10BKsg2g/s320/cluster.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;After the wall had been removed, we cleared out the computer room and re-organised the storage cabinets, cabling and computing cabinets. In 243d there was a pile of 6-year-old disused worker nodes and racked worker nodes whose PDU had been damaged during one of our many power cuts over the last 12 months. 
In addition to this we found and rebuilt a Dell Rack and also we had a spare Nortel 5510 switch.&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-_lmC7qanYVo/TlO7HsPfoxI/AAAAAAAAAIw/rbhx03NHQPg/s1600/IMG_0331.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-_lmC7qanYVo/TlO7HsPfoxI/AAAAAAAAAIw/rbhx03NHQPg/s1600/IMG_0331.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;With the newly available space from the removal of the wall in 243d, we got a tile cut and deployed the rack. The rack connects back to the older Stack01 via a copper gigabit Ethernet connection. This deployment will give us up to approximately 100 job slots once they are fully configured.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-134470251920865229?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/134470251920865229/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=134470251920865229' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/134470251920865229'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/134470251920865229'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/08/night-of-return-of-living-worker-nodes.html' title='Night of the Return of the Living Worker Nodes'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/-5LTtzvv8RYc/TlO6jr80rAI/AAAAAAAAAIs/k9v10BKsg2g/s72-c/cluster.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4713062177314035054</id><published>2011-08-12T16:20:00.004+01:00</published><updated>2011-08-12T16:22:48.717+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='Storage'/><title type='text'>Running at capacity again</title><content type='html'>&lt;br /&gt;... after the shutdown.  Slightly delayed due to coming back during a low point in Atlas work, which is now past us.&lt;br /&gt;&lt;br /&gt;Here's a graph of data moved from our storage element, and you can probably pick out the rather subtle peak when the last batch of analysis traffic started (taking us up to capacity):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://1.bp.blogspot.com/-jAnPndhceQ4/TkVFH85WwmI/AAAAAAAAABs/5imPc6kac0w/s1600/SmallAmountOfNetworkTraffic.gif"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 111px;" src="http://1.bp.blogspot.com/-jAnPndhceQ4/TkVFH85WwmI/AAAAAAAAABs/5imPc6kac0w/s320/SmallAmountOfNetworkTraffic.gif" border="0" alt="" id="BLOGGER_PHOTO_ID_5639990111259509346" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4713062177314035054?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' 
href='http://scotgrid.blogspot.com/feeds/4713062177314035054/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4713062177314035054' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4713062177314035054'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4713062177314035054'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/08/running-at-capacity-again.html' title='Running at capacity again'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/-jAnPndhceQ4/TkVFH85WwmI/AAAAAAAAABs/5imPc6kac0w/s72-c/SmallAmountOfNetworkTraffic.gif' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1277175804409965589</id><published>2011-08-10T16:45:00.002+01:00</published><updated>2011-08-10T16:50:24.510+01:00</updated><title type='text'>Power startup, situation (hopefully) normal</title><content type='html'>The planned power work in the Kelvin Building was completed this morning and we have been transferred back to our proper power feed from the generators. The power startup went smoothly and the building has returned to normal.&lt;br /&gt;&lt;br /&gt;The Scotgrid cluster was restarted after the power was seen to be stable and we came out of downtime at 2.20 pm. We will monitor our situation, but we hope that this power work will improve our stability over the coming months.
&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1277175804409965589?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1277175804409965589/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1277175804409965589' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1277175804409965589'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1277175804409965589'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/08/power-startup-situation-hopefully.html' title='Power startup, situation (hopefully) normal'/><author><name>David Crooks</name><uri>http://www.blogger.com/profile/07412551479798045933</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1519484715440773460</id><published>2011-08-03T21:17:00.003+01:00</published><updated>2011-08-03T21:17:45.950+01:00</updated><title type='text'>Controlled Shut Down. Please standby.</title><content type='html'>As many regular readers of our blog may have noticed, we have had several power cuts over the last 8 months. 
While the Scot Grid Glasgow cluster has survived relatively well with these interruptions,&amp;nbsp; the School of Physics and Astronomy at the University of Glasgow has undertaken a piece of work to resolve this recurrent issue.&lt;br /&gt;&lt;br /&gt;Therefore, on the 7th - 10th of August we will be going into a controlled downtime period so that the transformers which supply the mains feed into our site can be removed and upgraded.&lt;br /&gt;&lt;br /&gt;We should be back in action on the morning of Wednesday the 10th. &lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1519484715440773460?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1519484715440773460/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1519484715440773460' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1519484715440773460'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1519484715440773460'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/08/controlled-shut-down-please-standby.html' title='Controlled Shut Down. 
Please standby.'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4415356132344703779</id><published>2011-07-28T21:12:00.003+01:00</published><updated>2011-08-03T21:20:35.253+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='power outage'/><title type='text'>Circuits, Circuits everywhere but not a drop to switch</title><content type='html'>Since the late afternoon of the 26th of July we have been working to resume service on the Cluster at Glasgow.&lt;br /&gt;We were put into unexpected downtime by our old friend: the power cut.&lt;br /&gt;&lt;br /&gt;The root cause of this appears to be that the local mains supply into the site failed and was subsequently reinstated. However, we decided to restart the cluster on Wednesday morning, to ensure that there was a clean and stable supply into the site. So off to the Gocdb, announce the unscheduled downtime and proceed.&lt;br /&gt;&lt;br /&gt;While normally we would have immediately started on getting the cluster back online, as it turned out we couldn't have got ourselves back into production any sooner due to the residual issues caused by the power outage. 
As we have had several power interruptions at the site over the last 10 months, we have now got a reasonably robust restart procedure and we started this on Wednesday morning.&lt;br /&gt;&lt;br /&gt;Initially, we had absolutely no issues surrounding the reset of both rooms, bar the loss of a rather expensive 10 Gig Ethernet interface on one of the new Dell Switches and the loss of the switch configuration files, which was caused by yours truly not running a &lt;i&gt;copy run start&lt;/i&gt; on the switch after configuring a LAG group and QOS. We reconfigured the switch and all connectivity across the cluster was confirmed as good.&lt;br /&gt;&lt;br /&gt;We then proceeded to rebuild one of our internal stacks to free up the 10 Gig Interfaces on a Nortel 5530, which we had planned to move to our lower server room to build out the second 10 Gig link, mentioned in a previous post. This too went surprisingly well, but Dave and I had pretested building the stack and adding and removing devices and inserting new base units on older test equipment.&lt;br /&gt;&lt;br /&gt;We then retested again: Stacking and LAGs were working fine, Spanning tree was happy and the Cluster's network was in good shape. We then moved to phase 2 of the upgrade which was to insert the 5530 switch into the switch stack in the downstairs server room. After we inserted the switch in the stack, it came up and the entire stack stabilised and then started to forward traffic. &lt;br /&gt;&lt;br /&gt;However, about 3 minutes later we started to see the latency in the network rise and hosts fail to contact one another. Ping, SSH and normal cluster network traffic such as NFS, NTP and DNS also started to experience issues. We reduced the load on the network by detaching hosts from it but to no avail. We then removed the 5530 from the stack but the problem remained. 
Over the next 4 hours we tried a variety of tests which were all ending with either the dreaded Host Unreachable or 142 millisecond response times. To make matters worse (confusing), the switches were reporting an internal response time between rooms of 0.50 milliseconds via ping but telnet and ssh between devices were also timing out. &lt;br /&gt;&lt;br /&gt;As we were unable to ascertain the exact root cause, we called a break and went and got some air.&lt;br /&gt;&lt;br /&gt;20 minutes and one pizza slice later, it occurred&amp;nbsp; to me that if no device on the network was generating traffic at the volume required to generate a 94% packet loss scenario across multiple 10 Gig connections, then it has to be the network itself. Or rather what is attached to it.&lt;br /&gt;&lt;br /&gt;The 10 Gig Interface being cooked wasn't the cause as it was dead at this point, but the power cut had left another present:&lt;br /&gt;&lt;br /&gt;Damaged Ethernet Cables.&lt;br /&gt;&lt;br /&gt;As the Cluster is too large to manually go round and check every cable individually with a line tester, we did something that I, as a former telco engineer, don't like doing. We rebooted the switches in numbered sequence. Starting with Stack01.&lt;br /&gt;&lt;br /&gt;The purpose of this test is to isolate as quickly as possible the damaged cable, device or interface by pinging across the cluster from one room to another and intra switch if need be.&lt;br /&gt;&lt;br /&gt;So Ping from Svr001 (upstairs) to Node141 (downstairs).&lt;br /&gt;Destination Host Unreachable.&lt;br /&gt;Leave the ping running.&lt;br /&gt;Reboot Stack 01.&lt;br /&gt;Ping response time of 0.056 milliseconds&lt;br /&gt;Stack01 reloads.&lt;br /&gt;Destination Host Unreachable.&lt;br /&gt;&lt;br /&gt;We repeated this test twice. And got the same result.&lt;br /&gt;&lt;br /&gt;So onto Stack01. 
The partner switch which trunks into this stack to effect an uplink onto the core of our network did not report any errors on the multi-link trunk but also very little traffic. Neither did Stack01, until I tried to ping its loopback address from the partner switch. The error rate on the interfaces increased and CRC counters were recorded. So we systematically disabled the multi-link trunk link by link until the stack interconnect stabilised.&lt;br /&gt;&lt;br /&gt;This reduced the trunk's capacity substantially but it also stabilised the network. So we added the 5530 back into the Stack downstairs, turned on the partner ports upstairs and were rewarded with a 20 Gig backbone which is now operational at the Glasgow site.&lt;br /&gt;&lt;br /&gt;As for the old LAG connection it was stripped out completely this morning and by early afternoon we had re-instated a 6 Gig connection to Stack01 which is working happily. From here we brought the site out of downtime and are back on the Grid.&lt;br /&gt;&lt;br /&gt;We are putting in place an&amp;nbsp; internal tftp process for backing up switch configurations each night.&lt;br /&gt;&lt;br /&gt;The main lesson from this is that in a large layer 2 environment, the smallest issue can become a major one and plans are well advanced on the next set of configuration changes to the network at Glasgow, to get around this and other potential issues in the future.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4415356132344703779?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4415356132344703779/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4415356132344703779' title='0 
Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4415356132344703779'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4415356132344703779'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/07/circuits-circuits-everywhere-but-not.html' title='Circuits, Circuits everywhere but not a drop to switch'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8154404312210428015</id><published>2011-07-14T15:23:00.000+01:00</published><updated>2011-07-14T15:23:03.531+01:00</updated><title type='text'>We make knowledge possible</title><content type='html'>Just a quick Blog post with regard to the WLCG workshop held at DESY in Hamburg from the 11th to 13th of July.&lt;br /&gt;The various presentations covered aspects of all the experiments&amp;nbsp; and the future requirements for systems, storage, monitoring and networks. 
&lt;br /&gt;Links to the workshop agenda and content can be found here:&lt;br /&gt;https://indico.cern.ch/conferenceDisplay.py?confId=124407&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8154404312210428015?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8154404312210428015/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8154404312210428015' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8154404312210428015'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8154404312210428015'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/07/we-make-knowledge-possible.html' title='We make knowledge possible'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3020419968810847802</id><published>2011-07-11T14:40:00.000+01:00</published><updated>2011-07-11T14:40:28.729+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='cvmfs'/><category scheme='http://www.blogger.com/atom/ns#' term='glasgow'/><title type='text'>Everyone's doing a brand new filesystem now: Come on, baby, do the cvmfs now.</title><content type='html'>&lt;blockquote&gt;&lt;/blockquote&gt;Ever since I heard about it at CHEP 2010, I've been itching to get CVMFS set up at Glasgow, because it was so 
clearly a better solution for software provision than the old sgm-role / NFS-mounted area approach. &lt;br /&gt;&lt;div&gt;Concerns about the reliability of the hardware that the service was running on (it may still not be on production hardware at CERN as I write this) always held the more sensible minds here back, but now that it's all up and working at RAL, and RAL is providing a stratum-1 cache as a backup, there's nothing stopping us.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;So, following a combination of &lt;a href="http://www.gridpp.ac.uk/wiki/CVMFS_configuration_at_RAL"&gt;Ian Collier's description of the set-up at RAL&lt;/a&gt; and the &lt;a href="https://cernvm.cern.ch/project/trac/downloads/cernvm/cvmfstech-0.2.70-1.pdf"&gt;official CernVMFS technical report&lt;/a&gt; (pdf), with some adjustments to make changes to our Cfengine config, I spent some of last week getting cvmfs working on the cluster.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;For your edification, this is what I did:&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;1) First, set up the new repository you need. 
In our case, yum repositories (and gpg keys) are managed by cfengine, so, in our cfengine skel directory for the worker nodes, I added:&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;/div&gt;&lt;blockquote style="font: monospace;"&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;wget http://cvmrepo.web.cern.ch/cvmrepo/yum/cernvm.repo  -P ./skel/workers/etc/yum.repos.d/ &lt;/span&gt;&lt;/div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;wget http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM -P ./skel/workers/etc/pki/rpm-gpg/&lt;/span&gt;&lt;/div&gt;&lt;/blockquote&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;2) Fuse and cvmfs both want to have user and group entries created for them. We manage users and groups with cfengine, so I added a fuse group to /etc/groups and a cvmfs user and group. The cvmfs user also needs to be added as a member of the fuse group.&amp;nbsp;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;3) Now that the initial set-up bits are done, the new packages can be installed, again, using cfengine. 
I added the packages &lt;/div&gt;&lt;div&gt;&lt;blockquote&gt;&lt;/blockquote&gt;&lt;blockquote&gt;fuse ; fuse-libs ; cvmfs ; cvmfs-keys ; cvmfs-init-scripts&lt;/blockquote&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;to the default packages for our worker node class in cfengine.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;4) Editing configuration files.&lt;/div&gt;&lt;div&gt;You need to edit auto.master to get autofs to support cvmfs.&lt;/div&gt;&lt;div&gt;(Just add a line like&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;blockquote&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;/cvmfs&lt;span class="Apple-tab-span" style="white-space: pre;"&gt; &lt;/span&gt;/etc/auto.cvmfs&lt;/span&gt;&lt;/blockquote&gt;&lt;/div&gt;&lt;div&gt;as the auto.cvmfs map is added by the cvmfs rpm.&lt;/div&gt;&lt;div&gt;Remember to issue a: &lt;/div&gt;&lt;div&gt;&lt;span class="Apple-tab-span" style="white-space: pre;"&gt; &lt;/span&gt;service autofs reload&lt;/div&gt;&lt;div&gt;afterwards, or get your configuration management system to do so automagically for you.&lt;/div&gt;&lt;div&gt;)&lt;/div&gt;&lt;div&gt;You also need to configure fuse to allow users to access things as other users:&lt;/div&gt;&lt;div&gt;/etc/fuse.conf&lt;/div&gt;&lt;div&gt;&lt;blockquote&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;user_allow_other&lt;/span&gt;&lt;/blockquote&gt;And finally, you need to actually configure cvmfs itself. 
Cvmfs uses 2 main configuration files:&lt;br /&gt;&lt;blockquote&gt;&lt;/blockquote&gt;&lt;/div&gt;&lt;div&gt;default.local, which specifies modifications of the default settings for the local install&lt;/div&gt;&lt;div&gt;cern.ch.local, which specifies modifications of the default server to use for *.cern.ch repositories.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;/etc/cvmfs/default.local needs to be configured for:&lt;/div&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;div&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;CVMFS_USER=cvmfs&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;CVMFS_NFILES=32768&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;#CVMFS_DEBUGLOG=/tmp/cvmfs.log&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;CVMFS_REPOSITORIES=atlas.cern.ch,atlas-condb.cern.ch,lhcb.cern.ch,cms.cern.ch,geant4.cern.ch,sft.cern.ch&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;CVMFS_CACHE_BASE=/tmp/cache/cvmfs2/&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;CVMFS_QUOTA_LIMIT=10000&lt;/span&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;CVMFS_HTTP_PROXY="nameoflocalsquid1|nameoflocalsquid2"&lt;/span&gt;&lt;/div&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;/etc/cvmfs/cern.ch.local, for UK sites should probably be configured as:&lt;/div&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;div&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, 
monospace;"&gt;CVMFS_SERVER_URL="http://cernvmfs.gridpp.rl.ac.uk/opt/@org@;http://cvmfs-stratum-one.cern.ch/opt/@org@"&lt;/span&gt;&lt;/div&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;&lt;div&gt;(since RAL is closer to us than CERN).&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;A brief note: ';' in a list of options specifies failover, and '|' load-balancing. So "foo;bar" means "try foo, then bar", while "foo|bar;baz" means "try to load-balance queries between foo and bar, if that fails, try baz". This works for the squid proxy specifiers in default.local and also the server destinations in cern.ch.local .&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Another note: the cache directory specified in default.local should be large enough to actually cache a useful amount of data on each worker node. 10Gb per VO is reported to be comfortably enough, for atlas and lhcb, and therefore is probably wildly exorbitant for any other VO that would be using it. I've tested, and you can happily set this directory to be readable only by the cvmfs user, which gives you a tiny bit more security.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;If you change the configuration files for cvmfs, you need to get it to reload them, like autofs.&lt;/div&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;service cvmfs reload&lt;/span&gt;&lt;/div&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;div&gt;seems to work fine (and our cfengine config now does this if it has to update those config files).&lt;/div&gt;&lt;div&gt;&lt;br /&gt;In our case, I created the two config files, stuck them in the skel directories for worker nodes in cfengine, and added them to the list of files that are expected to be on worker nodes in the config.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;5 ) You can check that all this is working by trying a&amp;nbsp;&lt;span 
class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;service cvmfs probe&lt;/span&gt;&lt;br /&gt;or explicitly mounting a cvmfs path somewhere outside of automount's config.&lt;br /&gt;With the default config, atlas software is at /cvmfs/atlas.cern.ch and so on.&lt;br /&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3020419968810847802?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3020419968810847802/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3020419968810847802' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3020419968810847802'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3020419968810847802'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/07/everyones-doing-brand-new-filesystem.html' title='Everyone&apos;s doing a brand new filesystem now: Come on, baby, do the cvmfs now.'/><author><name>Sam Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3831205974611145964</id><published>2011-07-01T16:00:00.004+01:00</published><updated>2011-08-03T21:19:01.422+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>A switch port too far</title><content type='html'>As part 
of the ongoing upgrades surrounding the recent issues that the CEs have had when communicating with svr016, we decided to upgrade the core backbone link to 20 Gigabits. Presently, we have one 10 Gigabit trunk link between 141 and 243d, which is occasionally saturating with traffic.&lt;br /&gt;&lt;br /&gt;As previously posted, we disabled the 10 gigabit link into Stack01 and used the XFP GBIC recovered from it to facilitate this new link. Sam and I laid new fiber optic patch leads in both rooms to the patch panels and connected these to spare ports on the Core Dell 8024F and Stack02's 5530.&lt;br /&gt;&lt;br /&gt;However, the link refused to come up. After several hours investigation we acquired a fiber optic line tester which proved that light was coming through the new link. We then tested the ports on both switches with a fiber optic loop.&lt;br /&gt;&lt;br /&gt;While the port and GBIC in the 8024F looped correctly, you get a rather re-assuring green link light on the transmit and receive port, it failed on the port in Stack02. 
We retested the XFP in its old unit, stack01 and it came up correctly using the loop.&lt;br /&gt;&lt;br /&gt;While we are using 62.5 um patch leads which, under the standards can't be driven as far as 50 um,&amp;nbsp; we thought this may have been the issue, we confirmed that this wasn't the case through the re-testing of all the components end to end with the fiber optic meter.&lt;br /&gt;&lt;br /&gt;We cleaned out the interface slot on the Stack02 5530 with compressed air and isopropyl alcohol,&amp;nbsp; the port, while recognising the gbic correctly, did not bring up the link.&amp;nbsp; We fear that the on board optical interface is damaged, however we would need to put the site into downtime to confirm this, so we have come up with a Plan B.&lt;br /&gt;&lt;br /&gt;As we have successfully built a LAG between 141 and 243d which is in place and did not impact service at all during its commissioning, and have laid in the fiber interconnect,&amp;nbsp; we have decided to investigate moving our second 5530 into Stack02 from Stack01 to give us the 20 Gigabit uplink that we require within the core of the network.&lt;br /&gt;&lt;br /&gt;More on this after the move.&amp;nbsp; &lt;br /&gt;&lt;br /&gt;As an aside, you never know how windy cold aisles are, until you lift a floor tile. 
Sam is on the floor in this image and not glued to the ceiling as his hair direction may imply.&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-n71CaBvN2tQ/Tg3fPN94ktI/AAAAAAAAAIY/IffzIQ4XF2A/s1600/sam.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="191" src="http://2.bp.blogspot.com/-n71CaBvN2tQ/Tg3fPN94ktI/AAAAAAAAAIY/IffzIQ4XF2A/s320/sam.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3831205974611145964?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3831205974611145964/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3831205974611145964' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3831205974611145964'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3831205974611145964'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/07/switch-port-too-far.html' title='A switch port too far'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/-n71CaBvN2tQ/Tg3fPN94ktI/AAAAAAAAAIY/IffzIQ4XF2A/s72-c/sam.jpg' 
height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5009314537924582</id><published>2011-07-01T15:20:00.005+01:00</published><updated>2011-08-03T21:21:11.450+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>And after studying its behaviour, objectively and critically, we believe we have a reliable method (With apologies to Neil Fallon)</title><content type='html'>Since the last post on the blog we have implemented a series of measures on the network which were planned to be deployed during the next Cluster refresh.&lt;br /&gt;&lt;br /&gt;Primarily, we have migrated elements of our core servers such as svr020, svr001 and svr008 to the new Dell switch infrastructure and have introduced a series of Link Aggregation Groups (LAGS) across the Dell estate to raise their backbone to a full 20 Gigabits per second intra switch. This has led to the decommissioning of the core 10 Gigabit interconnect into our old Nortel gateway, stack01, and this has been replaced with another LAG between the Dell's and stack01. The reason behind this will become clear in the next post.&lt;br /&gt;&lt;br /&gt;The main upshot of this part of the network upgrade is that we now can have greater control over the network services and monitoring running out of these servers such as SNTP and Ganglia respectively. These can be fine tuned to a greater degree on the Dell environment to minimise the broadcast and Layer 2 multicast impact of these services.&lt;br /&gt;&lt;br /&gt;However, that is not to say that the Nortel's are on the way out quite yet. Our Torque and Maui Server, svr016, still resides on older Nortel equipment in Stack02 which is currently connected to the new Dell infrastructure by a 10 Gig fibre. 
This link is occasionally saturating; we have decided to upgrade the link to 20 Gigabits by running a new multimode fibre between the two computer rooms, 141 and 243d. We also decided to implement Layer 2 QOS for Server016 to ensure that it got priority over all other cluster traffic within the stack and through the core network switches.&lt;br /&gt;&lt;br /&gt;Therefore, we embarked on the re-configuration on the QOS parameters on Stack02. The complexity behind this lies not in the actual end configuration: effectively the mac address of svr016 is tracked across VLAN's 1 and 2 respectively to ensure that a Gold Quality of Service is met for any device wishing to speak to or be spoken to by Svr016. The real complexity is implementing this so that you don't disable the entire cluster attached to the network stack.&lt;br /&gt;&lt;br /&gt;Earlier implementations of the Nortel OS had a nasty tendency to drop all non-specified traffic within the network, and the QOS policy generation, while incredibly granular in its ability to tag and filter traffic, involves 6 different stages to ensure that traffic is correctly tagged and forwarded.&lt;br /&gt;&lt;br /&gt;Added to the fact that if the MAC address do not have the correct MAC address mask&amp;nbsp; all traffic generated by Svr016 will be dropped, effectively disabling the cluster for a period of time, a general picture of the care required to implement this feature developed on our part. &lt;br /&gt;&lt;br /&gt;Sam and myself rechecked the configurations twice before attempting to implement them. However, when we attempted to commit&amp;nbsp; we discovered that the Nortel GUI is a lot more thorough in its checks than we could ever have imagined. Due to a mis-configuration of the MAC address mask the system refused to commit it to the switches. 
It even supplied an error message which identified that the mask was wrong.&lt;br /&gt;&lt;br /&gt;Once the mask had been corrected the configuration was loaded onto stack02 and immediately started to work. The image below shows the packet matching since the 30th of June 2011.&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-FCUlGKrjaoc/Tg3Rd7RJuCI/AAAAAAAAAII/2_rEJVW7_hI/s1600/QOSworking%255D.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="320" src="http://1.bp.blogspot.com/-FCUlGKrjaoc/Tg3Rd7RJuCI/AAAAAAAAAII/2_rEJVW7_hI/s320/QOSworking%255D.png" width="173" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Now for the real test. How would it cope under increased DPM traffic loads?&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-OVKyX-qQ4Zw/Tg3SZUAJfMI/AAAAAAAAAIU/LMdurrkms80/s1600/DPM.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="111" src="http://3.bp.blogspot.com/-OVKyX-qQ4Zw/Tg3SZUAJfMI/AAAAAAAAAIU/LMdurrkms80/s320/DPM.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;Surprisingly well: it turns out as now all traffic to and from svr016 has a low drop status and high precedence value across the network.&lt;br /&gt;&lt;br /&gt;The images below show the system performance during one of these recent events.&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-lORCRlyiNTQ/Tg3Rm2wrCVI/AAAAAAAAAIM/Ov56C3F-2_M/s1600/svr16network.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="111" src="http://3.bp.blogspot.com/-lORCRlyiNTQ/Tg3Rm2wrCVI/AAAAAAAAAIM/Ov56C3F-2_M/s320/svr16network.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;div class="separator" 
style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-e0MwuCU5kLo/Tg3RqRNSodI/AAAAAAAAAIQ/CaG0BYoeU7U/s1600/svr016packets.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="111" src="http://3.bp.blogspot.com/-e0MwuCU5kLo/Tg3RqRNSodI/AAAAAAAAAIQ/CaG0BYoeU7U/s320/svr016packets.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;As can be seen, there is no real increase in activity now as the QOS mappings for svr016 now mean that, while it is still part of the production and external VLANs it always travels 1st class.&lt;br /&gt;&lt;br /&gt;The next phase of QOS development is to start to investigate the corralling of network broadcasts for services such as NFS to see if we can reduce the background chatter on the network without impacting service.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-5009314537924582?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5009314537924582/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5009314537924582' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5009314537924582'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5009314537924582'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/07/and-after-studying-its-behaviour.html' title='And after studying its behaviour, objectively and critically, we believe we have a reliable method 
(With apologies to Neil Fallon)'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/-FCUlGKrjaoc/Tg3Rd7RJuCI/AAAAAAAAAII/2_rEJVW7_hI/s72-c/QOSworking%255D.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7425575976853854793</id><published>2011-06-24T15:39:00.005+01:00</published><updated>2011-07-01T16:02:06.215+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='torque'/><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='Storage'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>The Grid is a hungry, hungry beast....</title><content type='html'>... and it eats networks.  From here begins a long, convoluted story, ending, as these often do, in something that seems like it should have been obvious.&lt;br /&gt;&lt;br /&gt;We've been noticing some 'blips', during which Maui fights bravely but ultimately fails to schedule jobs.  This is generally considered rather sub-optimal.&lt;br /&gt;&lt;br /&gt;The root of it was that Maui was failing with an error:&lt;br /&gt;&lt;br /&gt;&lt;pre&gt;ERROR:    cannot get node info: Premature end of message&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;That Maui error results in Maui taking a break for 15 minutes, before trying to schedule anything again.  Which is fair enough, in the face of communication errors.  Only ... Maui doesn't speak to anything except the Torque server.  
Which is running on the same host.&lt;br /&gt;&lt;br /&gt;So what's actually happening here is that Torque can't talk to some node or other, and reporting that to Maui, which is then breaking.  It didn't seem right that a communication failure to a single node once should stop jobs from starting elsewhere, which prompted some deeper investigation.&lt;br /&gt;&lt;br /&gt;Looking for obvious correlations, we noticed that the scheduling blips happened right when we're running lots of analysis jobs - exactly when we &lt;span style="font-style: italic;"&gt;don't&lt;/span&gt; want scheduler blips! However, it wasn't an obvious correlation, in that sometimes running 1000 jobs at once was fine, other times 400 caused things to gum up.&lt;br /&gt;&lt;br /&gt;More worry-some than sub-optimal scheduling was that during the same time period we got occasional errors from the CE's, of the form:&lt;br /&gt;&lt;br /&gt;&lt;pre&gt;BLAH error: submission command failed (exit code = 1)&amp;nbsp;&lt;/pre&gt;&lt;pre&gt;(stdout:)&amp;nbsp;&lt;/pre&gt;&lt;pre&gt;(stderr:pbs_iff: cannot read reply from&amp;nbsp;&lt;/pre&gt;&lt;pre&gt;pbs_server-No Permission.-qsub:&amp;nbsp;&lt;/pre&gt;&lt;pre&gt;cannot connect to server svr016.gla.scotgrid.ac.uk&amp;nbsp;&lt;/pre&gt;&lt;pre&gt;(errno=15007) Unauthorized Request&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;Dissecting that down, the BLAH part is CREAM saying it can't submit the job, so we're looking at the pbs_iff part.  The purpose of pbs_iff is to authenticate the current user to the Torque server, so that the job is run with the correct user id (and can be checked with the ACL's on the server, if appropriate).  The next part with qsub is just reporting that it's not able to talk to the server.&lt;br /&gt;&lt;br /&gt;The root problem is pbs_iff not able to communicate, after which the rest of the qsub is failing for lack of authentication.  
This is a problem, because these are jobs that are already accepted by the CREAM CE, and shouldn't be failed here.  (If a site can't cope with the jobs, the CE should be disabled, so it never accepts the jobs - that's the signal to the submitter/WMS to try elsewhere.)&lt;br /&gt;&lt;br /&gt;How does all this link back to the network issues?  Well, our cluster is split into two rooms - linked by a couple of fibres.&lt;br /&gt;&lt;br /&gt;During analysis, we can see 2 GB per second (yes, that's in bytes) in traffic leaving the disk servers.  Roughly half the disk and about half of the CPUs [see later!] are in each room; that implies that given a random distribution half that traffic has to pass through the fibre link.&lt;br /&gt;&lt;br /&gt;And, yep, that's the problem right there.  The Torque server is unable to shout loud enough to talk to the nodes when the link is full, or be heard from some of the CE's.  Digging into the stats shows that the link is running at 83% average utilisation, over the past month.  So when analysis hits, it wipes out any other traffic.&lt;br /&gt;&lt;br /&gt;For the moment, then, I've put a cap on the number of analysis jobs until we can resolve this, as mitigation.  And sent Mark off to find some more fibre and ports on the switches!&lt;br /&gt;&lt;br /&gt;Some interesting sums: Turns out we have nearer 1/3 the CPU upstairs, and 2/3 (1200 job slots) downstairs.  Disk is close to 1/2 each.  Matching this up with the planning number of 5 MB per second 'disk spindle to analysis cpu' bandwidth suggests that we need 3 GB per second, or 24 Gbs-1 bandwidth between the rooms to run at full capacity.  Compared to 10 Gbs-1 at the moment.&lt;br /&gt;&lt;br /&gt;Hrm.  No wonder we were having difficulty!  
On the other hand, it's probably been this link that's the limiting factor in our analysis throughput, so we should be able to roughly double our peak throughput of analysis jobs once that link is upgraded.&lt;br /&gt;&lt;br /&gt;That, and not have the scheduler taking a wee nap during peak times.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7425575976853854793?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7425575976853854793/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7425575976853854793' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7425575976853854793'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7425575976853854793'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/06/grid-is-hungry-hungry-beast.html' title='The Grid is a hungry, hungry beast....'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-6619590524553377180</id><published>2011-06-01T15:19:00.001+01:00</published><updated>2011-08-03T21:19:36.374+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Network'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>Side Effects may include ...</title><content type='html'>On Wednesday, the 25th, the Glasgow Scotgrid site was part of the wider SSC5 
Security Challenge and during the course of the challenge we encountered several issues with the network security configuration on our core switch.&lt;br /&gt;&lt;br /&gt;The configuration changes which caused issues are specifically:&lt;br /&gt;1) Access List Configuration for inbound services&lt;br /&gt;2) ICMP dos-control settings&lt;br /&gt;&lt;br /&gt;The Access List Configuration (ACL) did not accept a global default permit with a wild card mask for both IP address ranges and subnets. The key issue here is that when the Access List was applied&amp;nbsp; on an access port for inbound traffic the Access List worked correctly. However, when applied to the primary egress port onto our network switch it disabled remote connectivity into the cluster, while not impacting internal&amp;nbsp; machine to machine traffic on the cluster.&amp;nbsp; The access list was removed and remote access was restored. The root cause for this failure was traced to an incorrectly set ACL ANY permit within the list, however on further investigation each network requiring access to and from the cluster will require its own unique entry rather than a default network range with a series of denied services.&amp;nbsp; The central IT group at the University also run a series of access lists and fire walls within the edge routing and switching network to the JANET environment which can be adapted to fit our requirements within the cluster setup at Glasgow.&lt;br /&gt;&lt;br /&gt;A secondary issue:&lt;br /&gt;&lt;br /&gt;A dos-control setting which controls the maximum payload for ICMP also caused unusual network behaviour after it was implemented. 
Effectively by limiting the payload to 512 bytes, this caused Maui and Torque to encounter issues when attempting to communicate with one another which then impacted other services within the cluster environment, while this slowed down Torque and Maui it did not completely stop the cluster, however its removal immediately improved data connectivity within the cluster. This issue is being referred back to the manufacturer as the payload incrementation only increases to 1023 bytes presently.&lt;br /&gt;&lt;br /&gt;Once we have an update on this issue we will post it up on the blog.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-6619590524553377180?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/6619590524553377180/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=6619590524553377180' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6619590524553377180'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6619590524553377180'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/06/side-effects-may-include.html' title='Side Effects may include ...'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' 
src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5740974042630964553</id><published>2011-06-01T11:39:00.002+01:00</published><updated>2011-06-01T12:07:44.869+01:00</updated><title type='text'>EGEE to EGI</title><content type='html'>We were recently asked to make sure that we were tagging our site as belonging to EGI and not EGEE, since the latter project has been ended for some time. This would typically involve changing a line entry in our site-info.def file and rerunning YAIM on the appropriate servers. However, as rerunning YAIM is a complete reconfiguration of a service, we decided to look into the exact alteration required to ensure that there was a low impact for the change.&lt;br /&gt;&lt;br /&gt;As of June 2011, using a glite installation, the information that is published through the site bdii is stored in the /opt/glite/etc/gip/ldif directory on each server (this would be different using an EMI installation). The exact files that are in that directory depend on the type of service that is publishing, but in this case we're interested in the glite-info-site.ldif file which is on the site bdii itself. We have (or had) 3 entries mentioning EGEE:&lt;br /&gt;&lt;br /&gt;GlueSiteOtherInfo: EGEE_ROC=UK/I&lt;br /&gt;GlueSiteOtherInfo: EGEE_SERVICE=prod&lt;br /&gt;GlueSiteOtherInfo: GRID=EGEE&lt;br /&gt;&lt;br /&gt;Of these, we have updated&lt;br /&gt;&lt;br /&gt;GlueSiteOtherInfo: GRID=EGEE&lt;br /&gt;&lt;br /&gt;to&lt;br /&gt;&lt;br /&gt;GlueSiteOtherInfo: GRID=EGI&lt;br /&gt;&lt;br /&gt;and restarted the site bdii. After a small wait for the update to appear, we are now appropriately tagged as belonging to EGI as opposed to EGEE. 
Discussions are now underway as to the appropriate values for the other two variables.&lt;br /&gt;&lt;br /&gt;In the site-info.def file itself (which should be updated to make sure that a future run of YAIM on the site BDII does not reverse this change)  the corresponding change in our case is:&lt;br /&gt;&lt;br /&gt;SITE_OTHER_GRID="EGEE|WLCG|SCOTGRID|GRIDPP"&lt;br /&gt;&lt;br /&gt;to&lt;br /&gt;&lt;br /&gt;SITE_OTHER_GRID="EGI|WLCG|SCOTGRID|GRIDPP"&lt;br /&gt;&lt;br /&gt;For more information see https://wiki.egi.eu/wiki/MAN01&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-5740974042630964553?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5740974042630964553/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5740974042630964553' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5740974042630964553'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5740974042630964553'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/06/egee-to-egi.html' title='EGEE to EGI'/><author><name>David Crooks</name><uri>http://www.blogger.com/profile/07412551479798045933</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-2876450714914837322</id><published>2011-05-06T13:35:00.003+01:00</published><updated>2011-05-06T14:06:11.527+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='arc'/><title type='text'>Arc and lcmaps</title><content 
type='html'>&lt;a href="/2009/11/arc-authorisation-and-lcmaps.html"&gt;Last time&lt;/a&gt; I was talking about Arc, I mentioned that there was an issue with LCMAPS, relating to the bitness of the available libraries.&lt;br /&gt;&lt;br /&gt;And that once a 64 bit LCMAPS library was available, that'd be it.&lt;br /&gt;&lt;br /&gt;Well, as you might have inferred from a very slight delay, there's just a teensy bit more to it than that.&lt;br /&gt;&lt;br /&gt;64 bit libraries are now commonplace, and did, indeed resolve the problem we had.  However, they just turned up more problems.&lt;br /&gt;&lt;br /&gt;Cue one long, and rather frustrating search down the rabbit hole of shared library dependencies.  The root problem was that nothing was defining a symbol 'getMajorVersionNumber()', or the minor or patch number versions.  Finding what _should_ be doing that, and what those values ought to be was the tricky part.  Perhaps that's more a symptom of my not having spent very much time debugging shared library issues, rather than a sign of a genuinely hard problem.&lt;br /&gt;&lt;br /&gt;In the end, it's a known problem with the VOMS libraries, and it's not hard to correct for it in the small scale, by adding stub methods that return 0 in the application code, and compiling with -rdynamic.&lt;br /&gt;&lt;br /&gt;However, translating that into something that works for ARC is non-trivial.  Recompiling all of AREX to export functions to shared libraries is asking for trouble, given the size of the thing.  It's also debatable whether it's the right thing to do to work around what's really a bug in the libraries themselves.&lt;br /&gt;&lt;br /&gt;Fortunately, there is another option.  Arc can call plugins to do pool account mapping, and these are small external programs.  So writing a short wrapper around LCMAPS is straightforward, and then Arc delegates responsibility to this plugin, which is a nice, self contained place to have the workarounds.  
&lt;br /&gt;&lt;br /&gt;My version of such a plugin is &lt;a href="http://www.scotgrid.ac.uk/arc-lcmap.c"&gt;here&lt;/a&gt;, and should be identified in the arc.conf as&lt;br /&gt;unixgroup=mapplugin 5 arc-lcmap %D %P&lt;br /&gt;&lt;br /&gt;This now lets us use the same pool account mapping and authorisation infrastructure with both gLite and Arc.  In particular, this lets us open up the Arc CE to any of our normally supported VO's, as an option for them to explore.  That's a topic I'll be working with some VO's on over the summer.&lt;br /&gt;&lt;br /&gt;For the moment though, I need to dismantle the layer of auth systems hacks we were using for Arc.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2876450714914837322?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2876450714914837322/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2876450714914837322' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2876450714914837322'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2876450714914837322'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/05/arc-and-lcmaps.html' title='Arc and lcmaps'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' 
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4025539464732922440</id><published>2011-04-12T07:31:00.000+01:00</published><updated>2011-04-12T07:31:18.851+01:00</updated><title type='text'>Scotgrid goes East</title><content type='html'>&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a bitly="BITLY_PROCESSED" href="http://4.bp.blogspot.com/--lapxuwpASE/TaPwzcSOTiI/AAAAAAAAAHA/He-AdziRtyw/s1600/skyline.jpg" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="240" src="http://4.bp.blogspot.com/--lapxuwpASE/TaPwzcSOTiI/AAAAAAAAAHA/He-AdziRtyw/s320/skyline.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;&lt;span id="goog_1603449184"&gt;&lt;/span&gt;&lt;span id="goog_1603449185"&gt;&lt;/span&gt;&lt;br /&gt;We are currently attending the &lt;a bitly="BITLY_PROCESSED" href="http://uf2011.egi.eu/"&gt;EGI User Forum&lt;/a&gt;  in Vilnius, where we will be presenting on the Earth Sciences work being  conducted at the Glasgow site. 
There is also a blog of the various  events going on in the conference &lt;a bitly="BITLY_PROCESSED" href="http://www.egi.eu/blog/"&gt;here&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;The main themes are around virtualisation, software deployment and most importantly the user community interaction with the Grid.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4025539464732922440?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4025539464732922440/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4025539464732922440' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4025539464732922440'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4025539464732922440'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/04/scotgrid-goes-east.html' title='Scotgrid goes East'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/--lapxuwpASE/TaPwzcSOTiI/AAAAAAAAAHA/He-AdziRtyw/s72-c/skyline.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-2238000285562400443</id><published>2011-03-23T13:12:00.000Z</published><updated>2011-03-23T13:12:38.660Z</updated><title type='text'>ScotGrid Reloaded</title><content type='html'>As it is spring, we have decided to revamp 
the blog.&lt;br /&gt;We will be updating the blog over the next couple of weeks and tinkering with the layout.&lt;br /&gt;Please Stand By.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2238000285562400443?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2238000285562400443/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2238000285562400443' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2238000285562400443'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2238000285562400443'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/03/scotgrid-reloaded.html' title='ScotGrid Reloaded'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7945561006654776099</id><published>2011-03-23T13:05:00.000Z</published><updated>2011-03-23T13:05:54.984Z</updated><title type='text'>Spanning Tree, oh Spanning Tree</title><content type='html'>Following last week's power outages we were encountering issues with Spanning Tree reconvergence on our older switching equipment. 
The Nortel 5510 and 5530 switches which have been stalwarts of the Glasgow cluster install were experiencing a major rise in the number of BPDU's being transmitted, since the second power outage as well as an increase in the number of dropped packets across all interfaces. The cause of these two issues are partially inter-related. The switches had suffered a partial loss of configuration on the second power outage which resulted in several services including their NTP client and Spanning Tree to behave erratically. To resolve the Spanning Tree issue, the configuration was returned to the defaults for the protocol on the Nortel switches. This is shown below:&lt;br /&gt;&lt;br /&gt;Hello Time:&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2 seconds&lt;br /&gt;Maximum Age Time:&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 20 seconds&lt;br /&gt;Forward Delay:&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 15 seconds&lt;br /&gt;Bridge Hello Time:&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2 seconds&lt;br /&gt;Bridge Maximum Age Time:&amp;nbsp;&amp;nbsp;&amp;nbsp; 20 seconds&lt;br /&gt;Bridge Forward Delay:&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 15 seconds&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This stabilised the switches within the older Cluster and reduced the volume of BPDU's that we being sent to the core switch.&lt;br /&gt;&lt;br /&gt;An overview of the Spanning Tree Protocol is available here: http://en.wikipedia.org/wiki/Spanning_Tree_Protocol &lt;br /&gt;&lt;br /&gt;The second issue surrounding problems with dropped packets and pause frames was again related to the power outage and it appears this had resulted in several dozen worker nodes having 
problems communicating across the switch environment. This issue was improved by the nodes being off-lined and then rebooted after the network reset.&lt;br /&gt;&lt;br /&gt;We are still monitoring the situation and will report on any other action taken if required.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7945561006654776099?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7945561006654776099/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7945561006654776099' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7945561006654776099'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7945561006654776099'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/03/spanning-tree-oh-spanning-tree.html' title='Spanning Tree, oh Spanning Tree'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1452022036058159891</id><published>2011-03-21T16:46:00.000Z</published><updated>2011-03-21T16:46:50.568Z</updated><title type='text'>Power Issues Redux</title><content type='html'>On the 15th of March we encountered two power outages within the Campus supply at Glasgow University. We had to put ourselves into downtime and remove ourselves from ATLAS production to effect a recovery from these power cuts. 
While the UPS infrastructure held up, we thought it prudent not to expose our user community to potential disruption.&lt;br /&gt;The root cause of these outages has now been repaired and we came out of downtime on Thursday the 17th of March.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1452022036058159891?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1452022036058159891/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1452022036058159891' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1452022036058159891'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1452022036058159891'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/03/power-issues-redux.html' title='Power Issues Redux'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8938365416201291998</id><published>2011-03-02T10:50:00.000Z</published><updated>2011-03-02T10:50:15.320Z</updated><title type='text'>Wide Area Wonder</title><content type='html'>After several months of investigating asymmetric traffic flows from Glasgow to RAL, we finally appear to have resolved the issue. 
Working with internal Computing Services staff at the University of Glasgow and GridPP staff at RAL we are now seeing sustained simultaneous transfer speeds around 2.3 Gig a second inbound and outbound.&lt;br /&gt;&lt;br /&gt;The commands run for tests are shown below:&lt;br /&gt;&lt;br /&gt;iperf -s -u -p 5001 -w 2M (client command to receive data)&lt;br /&gt;iperf&amp;nbsp; -d -u -p 5001 -t 600 -w 1M -c hostname -b 700M -i 30 (server command to send data)&lt;br /&gt;&lt;br /&gt;Associated network interface card and CPU loads on device one of the tests were run. &lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a bitly="BITLY_PROCESSED" href="https://lh5.googleusercontent.com/-53j6zJ1KAGQ/TW4gaBiWnsI/AAAAAAAAAGk/cwLpIlOYmWE/s1600/cpugraph.php.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="111" src="https://lh5.googleusercontent.com/-53j6zJ1KAGQ/TW4gaBiWnsI/AAAAAAAAAGk/cwLpIlOYmWE/s320/cpugraph.php.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a bitly="BITLY_PROCESSED" href="https://lh4.googleusercontent.com/-CHR1KKg9_ig/TW4gmqQntfI/AAAAAAAAAGo/5ttzotzRaEk/s1600/graph.php.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="111" src="https://lh4.googleusercontent.com/-CHR1KKg9_ig/TW4gmqQntfI/AAAAAAAAAGo/5ttzotzRaEk/s320/graph.php.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;/div&gt;&lt;br /&gt;&lt;br /&gt;Effectively, the Glasgow site is now an extension of the Clydenet to JANET infrastructure in the west of Scotland and we will be monitoring the services over the next month to ensure that this network solution is as stable and reliable as the previous interconnection.&lt;br /&gt;&lt;br /&gt;In addition to this work we will be investigating in Glasgow the optimisation of 
the Layer2 to Layer 3 network infrastructure between ourselves, the University and the rest of Gridpp over the next 3 months.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8938365416201291998?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8938365416201291998/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8938365416201291998' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8938365416201291998'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8938365416201291998'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/03/wide-area-wonder.html' title='Wide Area Wonder'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='https://lh5.googleusercontent.com/-53j6zJ1KAGQ/TW4gaBiWnsI/AAAAAAAAAGk/cwLpIlOYmWE/s72-c/cpugraph.php.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5100494733158674233</id><published>2011-02-21T16:21:00.000Z</published><updated>2011-02-21T16:21:06.272Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='avoiding dairy puns'/><category scheme='http://www.blogger.com/atom/ns#' term='cream'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' 
term='LCG CE'/><category scheme='http://www.blogger.com/atom/ns#' term='Accounting'/><title type='text'>The CE is dead. Long live the CE. Nos paenitet incommodo</title><content type='html'>As part of the on-going developments to the Scot Grid cluster at Glasgow, we have decommissioned our final LCG-CE, which resided on SVR021. The removal of this CE allows us to concentrate the support and development of two CE platforms; Cream and ARC. We are planning to conduct a series of tests around the three CREAM CE's we have deployed at Glasgow in an attempt to gain a better understanding of their maximum loading potential for running jobs and how to tweak them to gain the maximum efficiency from this service.&lt;br /&gt;&lt;br /&gt;Additionally, we will be testing our availability metrics over the next month as the LCG-CE was one of the corner stones of Steve Lloyd's tests of our overall availability. This will now be monitored primarily through our SRM availability.&lt;br /&gt;&lt;br /&gt;The reasons for decommissioning the LCG-CE are that we would be removing it at some point in the near future, all the big VO's do not have issues with submitting to Cream CEs and it simplifies our internal support requirements.&lt;br /&gt;&lt;br /&gt;The new servers running Cream are svr008, svr014 and svr026.&lt;br /&gt;&lt;br /&gt;Thank you LCG-CE and goodnight.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-5100494733158674233?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5100494733158674233/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5100494733158674233' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5100494733158674233'/><link 
rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5100494733158674233'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/02/ce-is-dead-long-live-ce-nos-paenitet.html' title='The CE is dead. Long live the CE. Nos paenitet incommodo'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-368773408748377174</id><published>2011-02-21T15:06:00.004Z</published><updated>2011-02-21T15:45:48.215Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='avoiding dairy puns'/><category scheme='http://www.blogger.com/atom/ns#' term='cream'/><category scheme='http://www.blogger.com/atom/ns#' term='glasgow'/><title type='text'>Covering up problems with CREAM</title><content type='html'>For some days now, ScotGrid Glasgow has been operating with only CREAM CEs, having turned our final lcg-CE off around the 14th. I'll let Mark cover the details of this in his later post, but I wanted to briefly mention one of the minor configuration details that caused some problems for us initially.&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;The gridmapdir (usually in /etc/grid-security/gridmapdir ) is a somewhat integral part of the pool account mapping system in LCG/gLite services. It contains one (empty) file for each pool account, plus hard-links to them from each DN(+VOMS Role) mapped to them. 
Basically, it's a cheap way to ensure that you don't get multiple mapped DNs to the same account (as you can always count the number of hard-links to an inode).&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;We share our gridmapdir, over NFS, to all of our CEs, to ensure that any incoming job from a given user is consistently mapped. Unfortunately, this led to our minor configuration gaffe (which I just fixed).&lt;/div&gt;&lt;div&gt;The lcg-CE, you see, is configured to set the ownership and permissions on the gridmapdir to 0755 root:root. This is fine for it, since lcg-CEs do strange things like running their services with root permissions, and it prevents anything else from messing up the mappings.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;CREAM CEs (using glexec), need to have their gridmapdir as 0775 root:glexec, a change which we hadn't made when we installed them (and which probably YAIM couldn't have done for us). This meant that, for the time the CREAM CEs were installed, they've never been able to create a new mapping in the gridmapdir, as they try to do that as members of the glexec group.&lt;/div&gt;&lt;div&gt;We never really noticed this problem while we had lcg-CEs which were busy, as the lcg-CE would almost always have also received jobs from the user previously and already performed the mapping. &lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Now that we don't have an lcg-CE, however, it started to cause some odd problems when we enabled new VOs, as the configuration seemed perfectly fine for the VO itself, but jobs would bounce off the CREAM CEs with "Failed to get the local userid with glexec" errors.&lt;/div&gt;&lt;div&gt;Obviously, this was trivially solved once we worked out what the issue was (by setting the gridmapdir's group-ownership and permissions to glexec g+w), but identifying it was a little tricky, as the default logging level for LCMAPS doesn't give many clues as to what problem it's having. 
&lt;/div&gt;&lt;div&gt;Turning the debug level up to 3 (in /opt/glite/etc/glexec.conf ) was sufficient to get it to log errors with gridmapdir_newlease(), however, and then, after some poking (and manual creation of DN links to see what happened), the problem became clear.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;So, this is a cautionary tale about moving from a mixed CE environment to a monoculture (ignoring Stuart's ARC installation) - sometimes a misconfiguration in one service can be hidden by the correct functioning of the service you're just about to remove.&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-368773408748377174?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/368773408748377174/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=368773408748377174' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/368773408748377174'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/368773408748377174'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/02/covering-up-problems-with-cream.html' title='Covering up problems with CREAM'/><author><name>Sam Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4296478412723643836</id><published>2011-01-19T09:39:00.003Z</published><updated>2011-01-19T09:52:26.015Z</updated><category 
scheme='http://www.blogger.com/atom/ns#' term='ATLAS'/><category scheme='http://www.blogger.com/atom/ns#' term='awesome'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>My God; it's full of data-transfers!</title><content type='html'>The Great ATLAS Spacetoken Migration of 2011 kicked off yesterday evening, and with 47TB of data sitting in MCDISK at Glasgow, Brian and We decided to take the opportunity to see how fast we could push it across to DATADISK.&lt;br /&gt;So, since ATLAS Data Management on this case happens over FTS (even though the vast majority of the transfers are internal to a site), we turned up the number of slots for STAR-GLASGOW a bit, from 20 (our default) to 50 (which was fun) up to 80 (although we peaked at around 65 used).&lt;br /&gt;With effectively no limit from FTS, our data rates were... impressive. Although it's an unfair comparison (everyone else was limited by FTS, and we were mostly moving things over the internal network), we managed to hit a peak transfer rate of 1.5GB/s internally (yes, that's 12Gbit/s), and sustain at around 8Gbits/s. That equated to around 2/3s of the total UK data movement over STAR channels, or roughly 2/3s of ATLAS's total traffic in this migration. 
At that rate, none of our disk servers were stressed, and the network switches were intensely relaxed.&lt;br /&gt;&lt;br /&gt;Some exciting graphs follow:&lt;br /&gt;&lt;div&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://4.bp.blogspot.com/_cPmxvuJqJHY/TTa0HCf71eI/AAAAAAAAAFA/VLIZGDq8kWQ/s1600/Screen%2Bshot%2B2011-01-18%2Bat%2B16.53.46.png"&gt;&lt;img style="cursor:pointer; cursor:hand;width: 284px; height: 320px;" src="http://4.bp.blogspot.com/_cPmxvuJqJHY/TTa0HCf71eI/AAAAAAAAAFA/VLIZGDq8kWQ/s320/Screen%2Bshot%2B2011-01-18%2Bat%2B16.53.46.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5563832422685726178" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://4.bp.blogspot.com/_cPmxvuJqJHY/TTa0Gx5s6rI/AAAAAAAAAE4/oJJPnrk5b7E/s1600/Screen%2Bshot%2B2011-01-18%2Bat%2B16.53.35.png"&gt;&lt;img style="cursor:pointer; cursor:hand;width: 287px; height: 320px;" src="http://4.bp.blogspot.com/_cPmxvuJqJHY/TTa0Gx5s6rI/AAAAAAAAAE4/oJJPnrk5b7E/s320/Screen%2Bshot%2B2011-01-18%2Bat%2B16.53.35.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5563832418230397618" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_cPmxvuJqJHY/TTa0GtxX9QI/AAAAAAAAAEw/58FpejZCHHM/s1600/Screen%2Bshot%2B2011-01-18%2Bat%2B16.52.49.png"&gt;&lt;img style="cursor:pointer; cursor:hand;width: 320px; height: 158px;" src="http://2.bp.blogspot.com/_cPmxvuJqJHY/TTa0GtxX9QI/AAAAAAAAAEw/58FpejZCHHM/s320/Screen%2Bshot%2B2011-01-18%2Bat%2B16.52.49.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5563832417121727746" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4296478412723643836?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' 
href='http://scotgrid.blogspot.com/feeds/4296478412723643836/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4296478412723643836' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4296478412723643836'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4296478412723643836'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2011/01/my-god-its-full-of-data-transfers.html' title='My God; it&apos;s full of data-transfers!'/><author><name>Sam Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_cPmxvuJqJHY/TTa0HCf71eI/AAAAAAAAAFA/VLIZGDq8kWQ/s72-c/Screen%2Bshot%2B2011-01-18%2Bat%2B16.53.46.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3195223026840075769</id><published>2010-12-14T15:23:00.002Z</published><updated>2010-12-15T16:54:28.829Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='cream'/><category scheme='http://www.blogger.com/atom/ns#' term='blah'/><title type='text'>Clotted CREAM</title><content type='html'>&lt;a href="http://scotgrid.blogspot.com/2010/11/stale-cream-and-maui-partitioning.html"&gt;Last time I was blogging&lt;/a&gt;, I mentioned some problem with our CREAM CE, and too many jobs in the Blah Registry.&lt;br /&gt;&lt;br /&gt;Unlike my initial theory, the all_done interval problem turned out to not be the culprit; instead it was down to the Blah Registry.&lt;br /&gt;&lt;br /&gt;CREAM splits the whole deal with being a Compute Element into 
two main parts: the interaction with the wider world, which is handled with some Java code using Tomcat; and the direct interaction with the batch system, called BLAH, and written in C and shell script.&lt;br /&gt;&lt;br /&gt;The Java code, which I'll refer to as CREAM, as distinct from the BLAH parts, keeps its state in the MySQL database.  BLAH, on the other hand, uses a hand rolled indexed file, with C functions for accessing and writing data.&lt;br /&gt;&lt;br /&gt;The BLAH registry is updated by the command blah_job_registry_add after the qsub is complete; to record the mapping between the CREAM job ID and batch system job id.  This is the step where we ran into problems.  The version of CREAM we were running was set to purge jobs after about two months - and in two months we were putting just over half a million jobs through it.  &lt;br /&gt;&lt;br /&gt;With that many jobs in the registry, it was taking a noticeable time to add any job.  Further, the locking done effectively serialises access to the registry (i.e. Table locking in RDBMS parlance).  Couple that with the Atlas pilot factory's favourite habit of dumping jobs in batches of 10 to 20 at a time, and you can see how some jobs ended up taking longer than the timeout to register.  &lt;br /&gt;&lt;br /&gt;Just before we'd encountered this, there was a new version of CREAM released (glite-CREAM-3.2.8) that cut the default time before purging to about one month, and put the indices in a mmaped file; both should mitigate this problem.  We limped along with some workarounds for a bit [0], before doing that update earlier this week.  The update from 3.2.7 to 3.2.8 went very quickly, by the way; took us about 5 minutes; although we did have to manually tidy up /etc/sudoers.&lt;br /&gt;&lt;br /&gt;As it stands now, with about quarter of a million jobs in the registry, it's taking about a couple of seconds to register a job; but with occasional pauses when there are many jobs pending. 
Thus far it's prevented a recurrence of large number of blocked jobs, but I'll be keeping an eye on it.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;[0] The other CE's were having hardware issues, and we didn't want to have all the CE's down at once...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3195223026840075769?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3195223026840075769/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3195223026840075769' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3195223026840075769'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3195223026840075769'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/12/clotted-cream.html' title='Clotted CREAM'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8971177017345318841</id><published>2010-11-29T21:14:00.000Z</published><updated>2010-11-29T21:14:29.813Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='Water Cooling'/><category scheme='http://www.blogger.com/atom/ns#' term='Air Con'/><title type='text'>Scotgrid weekend Downtime</title><content type='html'>Due to an issue with one of the environmental control units relating to our water cooling system we had to take part of the cluster down over the weekend. 
The issue has now been identified and rectified. Normal service was resumed this afternoon for&amp;nbsp; the entire cluster.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8971177017345318841?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8971177017345318841/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8971177017345318841' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8971177017345318841'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8971177017345318841'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/11/scotgrid-weekend-downtime.html' title='Scotgrid weekend Downtime'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4815344232963442017</id><published>2010-11-26T16:49:00.003Z</published><updated>2010-11-26T17:17:36.935Z</updated><title type='text'>glite-APEL installation</title><content type='html'>This is my (belated) first post on the Scotgrid blog since I joined Scotgrid Glasgow in August as a   System/Data Manager, so hello to everyone.&lt;br /&gt;&lt;br /&gt;One thing that we have had planned for a while was to install a glite-APEL publishing server, which I put in place earlier this week. 
The install process was straightforward following these guides: &lt;a href="http://goc.grid.sinica.edu.tw/gocwiki/glite-APEL"&gt;glite-APEL GOC wiki&lt;/a&gt; and &lt;a href="http://northgrid-tech.blogspot.com/2010/07/moving-apel-to-sl5.html"&gt;Moving APEL to SL5&lt;/a&gt;. I found a couple of issues which might be interesting for anyone else installing the service, which I've written up in a wiki page on the Scotgrid wiki: &lt;a href="http://www.scotgrid.ac.uk/wiki/index.php/Glite-APEL_installation_notes"&gt;glite-APEL installation notes&lt;/a&gt; . One thing in particular that I'd be aware of (which is also mentioned in the other links above) is to make sure that keytool is linked to the correct version before running YAIM - see the &lt;a href="http://www.scotgrid.ac.uk/wiki/index.php/Glite-APEL_installation_notes"&gt;wiki link&lt;/a&gt; for more details on what we found.&lt;br /&gt;&lt;br /&gt;Although we'll keep an eye on the new server over the next few days to make sure that it is behaving correctly, everything seems to have gone smoothly.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4815344232963442017?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4815344232963442017/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4815344232963442017' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4815344232963442017'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4815344232963442017'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/11/glite-apel-installation.html' title='glite-APEL installation'/><author><name>David 
Crooks</name><uri>http://www.blogger.com/profile/07412551479798045933</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7316794071712770219</id><published>2010-11-26T14:40:00.000Z</published><updated>2010-11-26T14:40:56.177Z</updated><title type='text'>Second Cream CE for Glasgow Second Steps</title><content type='html'>The install of the second Cream CE has now been completed after a series of small set backs surrounding the validity of the software image held on our mirror in Glasgow, which has now been updated.&lt;br /&gt;&lt;br /&gt;The commands for the install are available on the ScotGrid Wiki:&lt;br /&gt;http://www.scotgrid.ac.uk/wiki/index.php/Glasgow_GLite_Cream_CE_installation&lt;br /&gt;&lt;br /&gt;After testing that the CE was publishing correctly for the cluster and could accept jobs. It was successfully tested with ATLAS pilot jobs. From this point its status in the GOCDB was changed from an LCG-CE to a Cream-CE and it has now entered production. 
We will monitor this new CE to make sure that it is functioning optimally over the next couple of weeks.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7316794071712770219?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7316794071712770219/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7316794071712770219' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7316794071712770219'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7316794071712770219'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/11/second-cream-ce-for-glasgow-second.html' title='Second Cream CE for Glasgow Second Steps'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1153101138382738777</id><published>2010-11-25T18:57:00.000Z</published><updated>2010-11-25T18:57:44.931Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='ATLAS'/><category scheme='http://www.blogger.com/atom/ns#' term='DPM'/><category scheme='http://www.blogger.com/atom/ns#' term='Users'/><title type='text'>Woops, there go the pool accounts...</title><content type='html'>We got a ticket on Tuesday because an ATLAS user couldn't get their files back from one disk server which had run out of its 600 (!) 
ATLAS mapped pool accounts.&lt;br /&gt;&lt;br /&gt;I did a bit of a hacky clean up, but actually this is very safe because, unlike a CE, there are no files involved to be mis-inherited by a subsequent user. The only issue would occur at the very moment that a user tried to transfer files.&lt;br /&gt;&lt;br /&gt;The clean up removed the oldest mappings, and even the busiest server was down to ~150 mappings and ~450 free slots, so adequate breathing room was gained.&lt;br /&gt;&lt;br /&gt;Sam is going to think about this in the storage group concept and write a more general tidier-upper for all VOs.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1153101138382738777?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1153101138382738777/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1153101138382738777' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1153101138382738777'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1153101138382738777'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/11/woops-there-go-pool-accounts.html' title='Woops, there go the pool accounts...'/><author><name>Graeme Stewart</name><uri>http://www.blogger.com/profile/04113191724360870254</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='21' height='32' 
src='http://www.physics.gla.ac.uk/~graeme/graeme.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8958747598819945974</id><published>2010-11-25T09:56:00.005Z</published><updated>2010-11-25T18:51:07.923Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='cream'/><category scheme='http://www.blogger.com/atom/ns#' term='maui'/><title type='text'>Stale CREAM and Maui partitioning</title><content type='html'>Nothing terribly exciting; but we've done a bit of an update on our Maui configuration, and a CREAM problem has been cleared a-whey.&lt;br /&gt;&lt;br /&gt;Previously, we've had our different eras of compute nodes annotated with the key speed notifier; so that the scheduler understands the fact that some are faster.  For the vast majority of grid jobs, this is an utterly irrelevant distinction - so long as the job gets the time it expects (and Maui is scaling requests that go to the slower nodes so they get longer).&lt;br /&gt;&lt;br /&gt;However jobs that use more than one process (i.e. MPI jobs) are a different case - if they get scheduled with different classes of nodes, then you get sub-optimal resource usage.  So it's useful to keep some distinction between them.  Previously we've been using reservations to restrict where jobs can go - but there's an (ill-defined) upper bound to the maximum number of overlapping reservations on a single job slot at once; too many breaks things.&lt;br /&gt;&lt;br /&gt;So we had a look at Partitions in Maui, which is really the proper way to handle these.  The downside is that you're limited to 3 partitions - there's a compiled in limit of 4, one of which is the special [ALL] partition, and one is DEFAULT.  Fortunately, we have 3 eras of kit - and as long as we're happy calling one 'DEFAULT', it all works.
And Maui understands never to schedule a job to more than one partition at a time.&lt;br /&gt;&lt;br /&gt;So we ended up with a lot of lines like:&lt;br /&gt;&lt;pre&gt;    NODECFG[node060] SPEED=0.94 PARTITION=cvold&lt;/pre&gt;But in order to make them used by all jobs, we had to adjust the default partitions available to include them all:&lt;br /&gt;&lt;pre&gt;    SYSCFG PLIST=DEFAULT:cvold&lt;/pre&gt;which gives all users equal access to all the partitions.&lt;br /&gt;&lt;br /&gt;Not terribly exciting Maui tweaks, but sometimes that's the way of it.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The CREAM problem manifested itself as a set of jobs that have gone sour.  There's a &lt;a href="http://grid.pd.infn.it/cream/field.php?n=Main.ErrorMessagesReportedByCREAMToClient#fail999"&gt;known problem&lt;/a&gt; in the current versions of CREAM where if the job is queueing on PBS for over an hour, CREAM (rather, the BLAH parser) thinks it's dead, and kills the job, with reason=999.&lt;br /&gt;&lt;br /&gt;What's not made explicit is that you need to have no spaces in that! i.e. you must have&lt;br /&gt;&lt;br /&gt;&lt;pre&gt;    alldone_interval=7200&lt;/pre&gt;&lt;br /&gt;because if you put alldone_interval = 7200, then CREAM doesn't understand that.  So fixed that, and it was all hunky dory for a while.  Then we started getting lots of blocked jobs in Torque again; all from CREAM.&lt;br /&gt;&lt;br /&gt;Cue more digging.&lt;br /&gt;&lt;br /&gt;Eventually found this in the CREAM logs (after a slight reformatting):&lt;br /&gt;&lt;br /&gt;&lt;span style="font-family:courier new;"&gt;  JOB CREAM942835851 STATUS CHANGED: PENDING =&gt; ABORTED&lt;br /&gt; [failureReason=BLAH error: no jobId in submission script's output&lt;br /&gt; (stdout:) (stderr:/opt/glite/etc/blah.config: line 81: alldone_interval: command not found-&lt;br /&gt; &lt;blah&gt; execute_cmd: 200 seconds timeout expired, killing child process. &lt;/blah&gt;)&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;So, two things here.  
Firstly, alldone_interval with spaces had crept back in, along with the correct version - in our case via cfengine directives, probably down to the &lt;a href="http://scotgrid.blogspot.com/2010/11/second-cream-ce-for-glasgow-first-steps.html"&gt;Double CREAM&lt;/a&gt; plan.  More interesting was that having the invalid part of the BLAH config present slows down BLAH (BUpdaterPBS was pegging a CPU at 100%), sufficiently that it hits another timeout at 200 seconds to respond at all.  And then CREAM kills the job, but doesn't actually tell Torque; the sandbox is blown away, so the job can't finish (nowhere to put output), or, if not started, can't start.&lt;br /&gt;&lt;br /&gt;&lt;s&gt;Removing the second (wrong) version of the alldone_interval fixed that - CPU use in the parser dropped to trivial levels, and all appears to be happy again.  This one's not really CREAM's fault, but it's always good to have an idea of what misconfigured services end up doing, otherwise it's hard to fix those 'not enough coffee' incidents.  Hence, this one for Google...&lt;/s&gt;&lt;br /&gt;&lt;br /&gt;UPDATE: Oops! Spoke too soon.  That's the problem defined, but clearly not the solution - as it's happened again.
Gonna leave this here as a reminder to self to give it a bit longer before considering something fixed...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8958747598819945974?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8958747598819945974/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8958747598819945974' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8958747598819945974'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8958747598819945974'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/11/stale-cream-and-maui-partitioning.html' title='Stale CREAM and Maui partitioning'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-6419747010903989421</id><published>2010-11-19T16:44:00.000Z</published><updated>2010-11-19T16:44:12.425Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='scotgrid-gla'/><category scheme='http://www.blogger.com/atom/ns#' term='Second Cream CE'/><category scheme='http://www.blogger.com/atom/ns#' term='LCG CE'/><title type='text'>Second Cream CE for Glasgow First steps</title><content type='html'>&lt;div style="font-family: Arial,Helvetica,sans-serif;"&gt;&lt;span style="font-size: small;"&gt;We are currently in the process of installing a second Cream CE at Glasgow. 
This will replace one of the LCG CEs at Glasgow. As this is my first major service install since joining ScotGrid and the Gridpp project at the end of August I thought I would share the process for this type of service change with the wider community.&lt;/span&gt;&lt;/div&gt;&lt;div style="font-family: Arial,Helvetica,sans-serif;"&gt;&lt;span style="font-size: small;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;&lt;div style="font-family: Arial,Helvetica,sans-serif;"&gt;&lt;span style="font-size: small;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;&lt;span style="font-family: Arial,Helvetica,sans-serif; font-size: small;"&gt;The first steps undertaken by myself was to drain the current LCG-CE to prepare it for the new install, the commands are shown below. &lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;" &lt;span style="font-family: Arial,Helvetica,sans-serif;"&gt;For multiple CE's with shared queues.  Edit the gip file on the CE you wish to drain.  This blocks WMS submission:&amp;nbsp;&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;pre&gt;vim /opt/lcg/libexec/lcg-info-dynamic-pbs&lt;br /&gt;&lt;br /&gt;change: push @output, "GlueCEStateStatus: $Status\n"&lt;br /&gt;to: push @output, "GlueCEStateStatus: Draining\n" "&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;" &lt;span style="font-family: Arial,Helvetica,sans-serif;"&gt;on the batch machine: vim /etc/hosts.equiv comment out the machine you wish to stop accepting jobs and restart maui:&amp;nbsp;&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;pre&gt;svr016:~# cat /etc/hosts.equiv&lt;/pre&gt;&lt;pre&gt;svr021.gla.scotgrid.ac.uk&lt;br /&gt;#svr026.gla.scotgrid.ac.uk "&lt;/pre&gt;&lt;pre&gt;&amp;nbsp;&lt;/pre&gt;&lt;pre&gt;&lt;/pre&gt;&lt;pre&gt;&lt;span style="font-family: Arial,Helvetica,sans-serif; font-size: small;"&gt;However the GOCDB was not updated by myself to indicate scheduled &lt;br /&gt;downtime for this service change and after a GGUS ticket this was quickly&lt;br /&gt;rectified.&amp;nbsp; We are waiting on the jobs to 
drain from the LCG-CE just now&lt;br /&gt; before continuing with the install early next week.&lt;/span&gt;&lt;/pre&gt;&lt;pre&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-6419747010903989421?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/6419747010903989421/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=6419747010903989421' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6419747010903989421'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6419747010903989421'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/11/second-cream-ce-for-glasgow-first-steps.html' title='Second Cream CE for Glasgow First steps'/><author><name>Mark Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5502123719651785294</id><published>2010-11-02T17:21:00.000Z</published><updated>2010-11-02T17:21:08.882Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='power outage'/><title type='text'>Normal Services Resume</title><content type='html'>On Friday the 29th of October, the ScotGrid, Glasgow site was impacted by two power outages at 15:25 and 15:40. These power cuts weren't localised to just the ScotGrid Glasgow site but also impacted other parts of the west end of Glasgow. 
These outages resulted in the site being placed in unscheduled downtime as we wanted to ensure that the power feed into the site was stable prior to returning the site to full production.&lt;br /&gt;&lt;br /&gt;On Monday the 1st of November we re-checked all essential core services, boosted our UPS capability and then re-checked all services were functioning correctly prior to the site re-entering full production.&lt;br /&gt;By 17:15 on Monday night we were expecting ATLAS jobs and the site is now back to a normal functioning basis.&lt;br /&gt;&lt;br /&gt;Interestingly enough our new 10 Gig Core re-acted as planned and rebooted in full operational mode minutes after each outage and was completely stable over the weekend, the new cluster equipment was also functioning correctly after both outages. In addition to this the older cluster equipment was not&amp;nbsp; badly affected by these power losses either. &lt;br /&gt;&lt;br /&gt;The site is now getting back to a normal functioning status.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-5502123719651785294?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5502123719651785294/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5502123719651785294' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5502123719651785294'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5502123719651785294'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/11/normal-services-resume.html' title='Normal Services Resume'/><author><name>Mark 
Mitchell</name><uri>http://www.blogger.com/profile/11230312300719381277</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='18' height='32' src='http://4.bp.blogspot.com/_1-iOiS0Iq18/SmyuhPjhJFI/AAAAAAAAAAM/4V_tW8Lquz0/S220/CNV00029.JPG'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8353748452886882939</id><published>2010-10-19T05:34:00.004+01:00</published><updated>2010-10-19T05:54:57.423+01:00</updated><title type='text'>CHEP 2010</title><content type='html'>If things have appeared to be quiet these days, it's mostly because they're anything but!  A few changes in staff and new hardware are directing attention; along with conference prep.&lt;br /&gt;&lt;br /&gt;Which is where I am right now; CHEP 2010 in Taiwan.  And since we arrived it's been raining constantly; makes me feel right at home!&lt;br /&gt;&lt;br /&gt;In addition to presenting our work with ARC, it's also interesting to see what's going on elsewhere.  From the &lt;a href="http://117.103.105.177/MaKaC/sessionDisplay.py?sessionId=31&amp;amp;slotId=0&amp;amp;confId=3#2010-10-19"&gt;same session&lt;/a&gt; that I was speaking in, there was a talk about Virtual Machine optimisation - which I think will be worth a look when we get back home.  It appears that doing some small tweaks can reduce the overhead, in particular the idle time CPU consumption.  
Although we don't do major computation inside the VM's, by using them for services they spend a good portion of their time idle - so tuning that might be a cunning plan for us.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8353748452886882939?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8353748452886882939/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8353748452886882939' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8353748452886882939'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8353748452886882939'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/10/chep-2010.html' title='CHEP 2010'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7483433813878045077</id><published>2010-09-14T14:40:00.016+01:00</published><updated>2010-09-14T16:22:46.258+01:00</updated><title type='text'>EGI Technical Forum 2010</title><content type='html'>&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_5kh-v9G-DCw/TI97t2amW-I/AAAAAAAAWEo/h7Cjk7Qh1MU/s1600/DSC_0185.JPG"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 213px;" src="http://3.bp.blogspot.com/_5kh-v9G-DCw/TI97t2amW-I/AAAAAAAAWEo/h7Cjk7Qh1MU/s320/DSC_0185.JPG" 
border="0" alt="" id="BLOGGER_PHOTO_ID_5516764096183557090" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;A few of us are in Amsterdam this week attending the EGI &lt;a href="http://www.egi.eu/EGITF2010/"&gt;Technical Forum&lt;/a&gt;. The rather interesting &lt;a href="https://www.egi.eu/indico/conferenceTimeTable.py?confId=48"&gt;programme&lt;/a&gt; really got underway after lunch today so, since there are three of us over here, we spread ourselves out around the many parallel meetings. &lt;br /&gt;&lt;br /&gt;Mike attended the "Virtual Research Communities" session (apparently these replace what we currently call "Virtual Organisations") and discovered a wealth of acronyms that he'd never heard of before; &lt;a href="http://www.dariah.eu/"&gt;DARIAH&lt;/a&gt;, &lt;a href="http://www.nexpres.eu/"&gt;NEXPReS&lt;/a&gt;, &lt;a href="http://envirogrids.net/"&gt;EnviroGRIDS&lt;/a&gt;, &lt;a href="http://www.enmr.eu/"&gt;e-NMR&lt;/a&gt; etc. In all there were &lt;a href="https://www.egi.eu/indico/sessionDisplay.py?sessionId=31&amp;slotId=0&amp;confId=48#2010-09-14"&gt;seven&lt;/a&gt; potential VRCs represented, each of which had a ten minute slot in which to provide a summary of their research field and outline their requirements. &lt;br /&gt;&lt;br /&gt;It turns out these seemingly disparate communities have broadly similar needs (authentication, authorization, data management etc) and don't necessarily have (or want to become) computing science experts. Who would've thought it. &lt;br /&gt;&lt;br /&gt;You know, what we need is some sort of Integrated Sustainable Pan-European Infrastructure for Researchers in Europe.
&lt;a href="http://www.egi.eu/projects/egi-inspire/"&gt;Oh&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7483433813878045077?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7483433813878045077/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7483433813878045077' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7483433813878045077'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7483433813878045077'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/09/few-of-us-are-in-amsterdam-this-week.html' title='EGI Technical Forum 2010'/><author><name>Mike Kenyon</name><uri>http://www.blogger.com/profile/17733966742979461363</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_5kh-v9G-DCw/TI97t2amW-I/AAAAAAAAWEo/h7Cjk7Qh1MU/s72-c/DSC_0185.JPG' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-40509622806490093</id><published>2010-08-19T15:17:00.007+01:00</published><updated>2010-08-19T23:27:03.267+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='monitoring'/><title type='text'>Why, yes ... we were using that...</title><content type='html'>So .... remind me never to do a 'nothing much happening' post again.  
It looks like tempting fate results in Interesting Times.&lt;br /&gt;&lt;br /&gt;Our cooling setup in one of the rooms is a bit quirky; based on a chilled water system (long story, but it was originally built for cooling a laser before we ended up with it).  There's been a few blips with the water supply, so duly an engineer was dispatched to have a poke at it.&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_wviPzonOIf0/TG09LKeSlGI/AAAAAAAAABM/sZ_tUGX2xMk/s1600/ScotGrid-graph.php.png"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer; width: 320px; height: 114px;" src="http://2.bp.blogspot.com/_wviPzonOIf0/TG09LKeSlGI/AAAAAAAAABM/sZ_tUGX2xMk/s320/ScotGrid-graph.php.png" alt="" id="BLOGGER_PHOTO_ID_5507125181343241314" border="0" /&gt;&lt;/a&gt;The 'poke' in this case involved switching it off, until he could delve into the midst of the machine, resulting in the rather exciting peak in temperatures (these measured using the on board thermal sensors in the worker nodes).&lt;br /&gt;&lt;br /&gt;We were supposed to get a warning from the building systems when the chiller went offline, and again when the water supply temperature rose too high.  (The air temperature lags behind the water temp, so it's a good early warning).  As neither of those happened, our first warning was the air temperature in the room, followed by the nodes' internal sensor alarms.&lt;br /&gt;&lt;br /&gt;First course of action was to offline the nodes, and then find the cause of the problem.  Once found, there was a short ... Explanation ... of why that was a Bad Time to switch off the chiller.  We'll schedule some downtime to get it done later; at some point when we're not loaded with production jobs.&lt;br /&gt;&lt;br /&gt;Still, little incidents like this are a good test for the procedures.
Everything went pretty smoothly, from offlining nodes to stop them picking up new jobs, through to the defence in depth of multiple layers of monitoring systems.&lt;br /&gt;&lt;br /&gt;Thankfully, we didn't need to do anything drastic (like hard powering off a rack); so we now know how long we have from a total failure of cooling until the effects kick in.  Time to sit down and do some sums, to make sure we could handle a cooling failure at full load that occurs at 3am...&lt;br /&gt;&lt;br /&gt;&lt;h2&gt;Update: 19/08/2010 by Mike&lt;/h2&gt;&lt;br /&gt;Never mind "sums", I took the physicist's approach a couple of years ago and got some real data:&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://1.bp.blogspot.com/_5kh-v9G-DCw/TG2sRabQIeI/AAAAAAAAWDY/wlCRkmREzcU/s1600/temprise.jpg"&gt;&lt;img style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; width: 454px; height: 310px;" src="http://1.bp.blogspot.com/_5kh-v9G-DCw/TG2sRabQIeI/AAAAAAAAWDY/wlCRkmREzcU/s320/temprise.jpg" alt="" id="BLOGGER_PHOTO_ID_5507247334495363554" border="0" /&gt;&lt;/a&gt;&lt;br /&gt;Triangles (offset slightly along x-axis for clarity) are the temperatures of worker nodes as reckoned by IPMI; stars are input air temperatures to the three downflow units in room 141 and the squares are flow/return water temperatures. I simulated a total loss of cooling by switching the chilled water pump off; all worker nodes were operating at their maximum nominal load. It took ~20 minutes for the worker node temperatures to reach 40 degrees, at which point I bottled it and restored cooling. So, for good reason, we now run a script that monitors node temperatures, and has the ability to power them off once a temperature threshold is breached. 
Oh, and that has been tested in anger.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-40509622806490093?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/40509622806490093/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=40509622806490093' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/40509622806490093'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/40509622806490093'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/08/why-yes-we-were-using-that.html' title='Why, yes ... we were using that...'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_wviPzonOIf0/TG09LKeSlGI/AAAAAAAAABM/sZ_tUGX2xMk/s72-c/ScotGrid-graph.php.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4667223871732503989</id><published>2010-08-19T11:05:00.002+01:00</published><updated>2010-08-19T13:28:27.467+01:00</updated><title type='text'>Business as unusual</title><content type='html'>There's a been a lot of little things happening up here; individually none of them quite big enough to blog about.&lt;br /&gt;&lt;br /&gt;And after a while, it's worth doing a catch up post about them.  
This is that post.&lt;br /&gt;&lt;br /&gt;David started a couple of weeks ago, and Mark is starting on Monday; just in time for the GridPP meeting.  It seems to be a tradition that every time we get new hardware, the staff rotate; Dug and myself started just around the last hardware upgrade.&lt;br /&gt;&lt;br /&gt;The hardware this time is mostly a petabyte of storage to be added, so David's been working on ways of testing the disks before we sign off on them.&lt;br /&gt;&lt;br /&gt;GridPP; next week.  Usual round of site reports, and future planning.  With the data from the LHC now a routine matter, it's time to start thinking about future needs.  I'll be talking about non-(particle)-physicists on the Grid, as a nod towards the longer term EGI picture.&lt;br /&gt;&lt;br /&gt;We noticed some load balancing issues on our SL5 disk pool nodes; Sam's been poking at that, and it looks like there's a mix of issues, from filesystem type (ext4 is better than xfs here), and clustering of files onto nodes.&lt;br /&gt;&lt;br /&gt;And that's most of the interesting stuff from up here.
Hopefully we'll have more to post about over the next few months&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4667223871732503989?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4667223871732503989/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4667223871732503989' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4667223871732503989'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4667223871732503989'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/08/business-as-unusual.html' title='Business as unusual'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4374981890269842385</id><published>2010-07-15T09:25:00.002+01:00</published><updated>2010-07-15T10:08:28.399+01:00</updated><title type='text'>LHCb transfers redux</title><content type='html'>Since the &lt;a href="http://scotgrid.blogspot.com/2010/05/return-of-lhcb-at-glasgow.html"&gt;last time we mentioned &lt;span class="blsp-spelling-error" id="SPELLING_ERROR_0"&gt;LHCb&lt;/span&gt;&lt;/a&gt;, we thought we had the problem licked.&lt;br /&gt;&lt;br /&gt;Sadly, we were mistaken.&lt;br /&gt;&lt;br /&gt;Like a Matryoshka doll, inside the first problem we found lurked another.  
This one was more widespread, however.&lt;br /&gt;&lt;br /&gt;Although we'd fixed the problem of failing jobs, during the course of each job there were a noticeable number of transfer failures.  That is, the job first attempted to send the data back to CERN, then if that failed, tried a number of other places until it eventually worked.  Notably transfers to PIC always seemed to work fine.&lt;br /&gt;&lt;br /&gt;During some other work involving ARC, I ended up tuning the TCP stack parameters on a service node, and noticed that we were using the default parameters on our worker nodes.  This led down a rabbit hole, till eventually finding a solution. &lt;br /&gt;&lt;br /&gt;The first idea was to tune the worker nodes for transfers to CERN, to see if making the transfers faster made more of them complete in time (and thus fewer failures).  Some tinkering suggested that the values that YAIM puts on a DPM pool node were decent choices, so slapped them in cfengine, and away we went.&lt;br /&gt;&lt;br /&gt;Problem cured.&lt;br /&gt;&lt;br /&gt;Surprise.&lt;br /&gt;&lt;br /&gt;Working out what was happening took a bit longer, and was down to Rob Fay at Liverpool.&lt;br /&gt;&lt;br /&gt;Part of the tuning that YAIM does is to turn off &lt;a href="http://www.faqs.org/rfcs/rfc2018.html"&gt;SACK&lt;/a&gt; and DSACK.  The other parts, about adjusting initial buffer sizes turned out not to be relevant here.  So why was SACK causing problems, and why was YAIM switching it off for the DPM pool nodes?&lt;br /&gt;&lt;br /&gt;Well, there's a bug in Linux conntrack module that thinks that SACK packets are invalid, and thus won't forward them.  If it's the recipient of the packets, it's all fine, but the forwarding code was fixed in 2.6.26 (2 years ago!), and before that it would reject the SACK packets, which caused the connection to eventually revert to conventional ACKs.  
SL5.3 uses a 2.6.18 kernel.&lt;br /&gt;&lt;br /&gt;As to why YAIM turns it off for DPM pool nodes; apparently because that's what YAIM did for CASTOR pool nodes at the time the YAIM module was written.  (It doesn't today).  This also explains why the transfers to PIC always worked - SACK needs both sides to agree to use it, and PIC uses a DPM (hence no SACK).&lt;br /&gt;&lt;br /&gt;So, upshot of all of that is that transferring from worker nodes to a storage element (that's not DPM) going through a NAT will be hit by this bug, crippling performance.&lt;br /&gt;&lt;br /&gt;Solutions to this are, in rough order of preference:&lt;br /&gt;1.  Always transfer to local storage and stage on from there.&lt;br /&gt;2. Don't use NATs.&lt;br /&gt;3.  If you have to transfer to remote storage, and have to use a NAT, turn off SACK and DSACK.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4374981890269842385?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4374981890269842385/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4374981890269842385' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4374981890269842385'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4374981890269842385'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/07/lhcb-transfers-redux.html' title='LHCb transfers redux'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' 
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4763341886022467924</id><published>2010-07-09T22:44:00.003+01:00</published><updated>2010-07-09T23:14:56.017+01:00</updated><title type='text'>WLCG Workshop</title><content type='html'>ScotGrid-Glasgow was well represented this week at the &lt;a href="http://indico.cern.ch/conferenceOtherViews.py?view=standard&amp;confId=82919"&gt;WLCG Collaboration Workshop&lt;/a&gt;, with the Tier-2 coordinator, site admin and data manager in attendance.&lt;br /&gt;&lt;br /&gt;Mike gave a &lt;a href="http://indico.cern.ch/getFile.py/access?contribId=7&amp;sessionId=1&amp;resId=1&amp;materialId=1&amp;confId=82919"&gt;talk&lt;/a&gt; outlining steps taken at Glasgow and other Tier-2 sites within the UK to provide effective end user support, both in the WLCG context and also for the smaller VOs.&lt;br /&gt;&lt;br /&gt;Graeme, wearing his ATLAS hat, &lt;a href="http://indico.cern.ch/getFile.py/access?contribId=2&amp;sessionId=8&amp;resId=2&amp;materialId=slides&amp;confId=82919"&gt;presented&lt;/a&gt; the WLCG Service from the Experiments' Viewpoint.&lt;br /&gt;&lt;br /&gt;Sam took the opportunity to meet with data-management developers and experts, discussing future plans and pledging Glasgow resources in the form of development-class servers.&lt;br /&gt;&lt;br /&gt;The event was covered with photos, video and a blog over at &lt;a href="http://gridtalk-project.blogspot.com/"&gt;GridCast&lt;/a&gt;. 
Speaking of which...we don't usually blow our own trumpet (too loudly) at Glasgow, but when it comes from &lt;a href="http://gridtalk-project.blogspot.com/2010/07/jamie-shiers-at-wlcg-workshop.html"&gt;this man&lt;/a&gt;, it's worth shouting about!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4763341886022467924?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4763341886022467924/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4763341886022467924' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4763341886022467924'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4763341886022467924'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/07/wlcg-workshop.html' title='WLCG Workshop'/><author><name>Mike Kenyon</name><uri>http://www.blogger.com/profile/17733966742979461363</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3710989448461445171</id><published>2010-06-22T16:29:00.005+01:00</published><updated>2010-06-22T18:41:17.109+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='monitoring'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>A baffling spot of localised cooling</title><content type='html'>How do you keep your cool in this sort of weather?  
Well, there's various options, but I'll bet one you've not tried is wrapping up in lots of insulating foam.&lt;br /&gt;&lt;br /&gt;And yet, that's been just the ticket for some worker nodes up here; despite it being one of the warmer days (23&lt;b&gt;°&lt;/b&gt; C outside).  Have a look at the temperature graph, and see if you can spot when something changed:&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://4.bp.blogspot.com/_wviPzonOIf0/TCDY-qS3wnI/AAAAAAAAAA8/FrdsmlGcOgU/s1600/243d-baffled.png"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer; width: 320px; height: 114px;" src="http://4.bp.blogspot.com/_wviPzonOIf0/TCDY-qS3wnI/AAAAAAAAAA8/FrdsmlGcOgU/s320/243d-baffled.png" alt="" id="BLOGGER_PHOTO_ID_5485622917154194034" border="0" /&gt;&lt;/a&gt;(The peak at midnight was due to a sneak attack Hammercloud; it was just before 12 when I put in the insulation.)&lt;br /&gt;&lt;br /&gt;I'd discovered that there's some empty head space at the top of the racks.  In those racks where there's a network switch at the top, this wasn't doing much, but where there were worker nodes, the top node was a lot hotter than the node two down from it.  That's a lot sharper change than I'd expected - it was noticeable by touching the metal cover on the front of the nodes.  The theory was that hot air out the back of the nodes was being sucked forward over the top of the highest node (through the headspace), and then recirculated round, getting hotter, until the steady state of it was about 5 K hotter than the others.&lt;br /&gt;&lt;br /&gt;So, it was time to do something about that.  First couple attempts at stopping up the gap didn't have much effect, until I dug out a few bits of packing foam (that the nodes were shipped in).  
Being, of course, the correct width, and just a bit taller than 1U, they fit snugly into the headspace.&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_wviPzonOIf0/TCDcQdgfcbI/AAAAAAAAABE/Ms5e3kCbB00/s1600/DSC00020.JPG"&gt;&lt;img style="margin: 0pt 0pt 10px 10px; float: right; cursor: pointer; width: 220px; height: 165px;" src="http://3.bp.blogspot.com/_wviPzonOIf0/TCDcQdgfcbI/AAAAAAAAABE/Ms5e3kCbB00/s320/DSC00020.JPG" alt="" id="BLOGGER_PHOTO_ID_5485626521494188466" border="0" /&gt; &lt;/a&gt;&lt;br /&gt;&lt;br /&gt;And that foam baffle reduced the temperature; to the point that the nodes at the top of the racks are now at the lowest temperature since records began! (i.e. they were installed.)  Counter intuitive, but that's the way air/heat flow goes sometimes.&lt;br /&gt;&lt;br /&gt;Although these worker nodes are due for replacement, we're going to be reusing the racks themselves, so little things like this are good to know.   It may be that this won't be a problem with the new worker nodes - or it might be the case that it'd be worse.  
Either way, forewarned is forearmed (and cooler).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3710989448461445171?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3710989448461445171/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3710989448461445171' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3710989448461445171'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3710989448461445171'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/06/baffling-spot-of-localised-cooling.html' title='A baffling spot of localised cooling'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_wviPzonOIf0/TCDY-qS3wnI/AAAAAAAAAA8/FrdsmlGcOgU/s72-c/243d-baffled.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-6424024595207591022</id><published>2010-06-04T21:13:00.009+01:00</published><updated>2010-06-04T22:53:22.150+01:00</updated><title type='text'>Phew, what a scorcher!</title><content type='html'>Yes, summer has arrived, even in Glasgow, and with it the people of this great city (myself included) are transformed from a &lt;a href="http://www.youtube.com/watch?v=PbWULu5_nXI"&gt;whiter-shade-of-pale&lt;/a&gt;, into something that can only be described as lobster-esque. 
We Celts do not tan well.&lt;br /&gt;&lt;br /&gt;Alas, it is not all fun and games in the sunshine, because the arrival of fine weather heralds the inevitable air-conditioning problems.&lt;br /&gt;&lt;br /&gt;Despite regular love and attention (serviced 3 times a year and recently hosed through with nitrogen) one of our roof-mounted compressors is particularly troublesome. This is most likely a combination of age (~12 years is the best guess) and 24 x 7 load; it serves the warmest corner of our original machine room.&lt;br /&gt;&lt;br /&gt;For this reason, I have a site meeting on Monday to discuss options, one of which will hopefully involve the replacement of said compressor before we take delivery of new hardware later this year.&lt;br /&gt;&lt;br /&gt;Also under consideration is a home-brew cold-aisle containment system. This will almost certainly be less sophisticated (and cheaper) than our excellent Knuerr racks in the basement, but should result in more intelligent use of the available chilled air.&lt;br /&gt;&lt;br /&gt;Until a solution arises, we shall continue to nurse the existing system through the summer months, and take comfort from the fact that there are clearly worse air-conditioning failures a site can suffer...&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://failblog.org/2010/06/04/epic-fail-photos-air-conditioning-fail-2/"&gt;&lt;img style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; width: 320px; height: 211px;" src="http://2.bp.blogspot.com/_5kh-v9G-DCw/TAlpkJea6UI/AAAAAAAAVt8/JkyZx3iIMw0/s320/airconfail.jpg" alt="" id="BLOGGER_PHOTO_ID_5479026491412638018" border="0" /&gt;&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-6424024595207591022?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' 
href='http://scotgrid.blogspot.com/feeds/6424024595207591022/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=6424024595207591022' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6424024595207591022'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6424024595207591022'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/06/phew-what-scorcher.html' title='Phew, what a scorcher!'/><author><name>Mike Kenyon</name><uri>http://www.blogger.com/profile/17733966742979461363</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_5kh-v9G-DCw/TAlpkJea6UI/AAAAAAAAVt8/JkyZx3iIMw0/s72-c/airconfail.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5671704548187013827</id><published>2010-05-25T14:29:00.005+01:00</published><updated>2010-05-28T20:31:57.296+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>So long and thanks for all the fish</title><content type='html'>I would just like to say thanks to everyone who I have worked with at ScotGrid, GridPP and EGEE.  I couldn't have picked a better time to be working on grid, LCG and WLCG.  I have learned a lot, accomplished most of the things I set out to do and hopefully contributed to the project in some small way.  I will always be on the other end of an email should you wish to get in touch.  
So long and thanks for all the fish.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-5671704548187013827?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5671704548187013827/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5671704548187013827' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5671704548187013827'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5671704548187013827'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/05/so-long-and-thanks-for-all-fish.html' title='So long and thanks for all the fish'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-408769197713402256</id><published>2010-05-20T14:26:00.004+01:00</published><updated>2010-05-20T14:30:00.662+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='VM'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='glite-UI'/><title type='text'>gLite Virtual Box Image takes off</title><content type='html'>A new user today was looking for the download of our pre-built UI Virtual box image for usage with two VOS: vo.iscpif.fr and vo.complex-systems.eu&lt;br /&gt;&lt;br /&gt;The ISC-PIF (Institut des Systèmes Complexes, Paris Île-de-France) is a multidisciplinary research and training center promoting the development of French, European and 
international strategic projects on complex adaptive systems, construed as large networks of elements interacting locally and creating macroscopic collective behaviour.&lt;br /&gt;&lt;br /&gt;Hopefully we will get some feedback on the image and any improvements that could be made.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-408769197713402256?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/408769197713402256/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=408769197713402256' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/408769197713402256'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/408769197713402256'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/05/glite-virtual-box-image-takes-off.html' title='gLite Virtual Box Image takes off'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1401682167744790021</id><published>2010-05-19T14:11:00.006+01:00</published><updated>2010-05-19T14:37:10.447+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SGE'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='Lustre'/><title type='text'>SGE and Lustre</title><content type='html'>On my list of things to do was install (Sun/Oracle) Grid Engine and get a CREAM CE submitting to it on my development 
cluster.  So far I have SGE installed and running qsub jobs.  I am documenting the experience for those who are interested &lt;a href="https://www.scotgrid.ac.uk/wiki/index.php/SGE_at_Glasgow_Install_Instructions"&gt;here&lt;/a&gt;.  I have opted for Lustre rather than NFS 3 as the latter is painfully ill-equipped for the task and we have a test Lustre instance to play with so why not go the whole hog.&lt;br /&gt;&lt;br /&gt;the positives ...&lt;br /&gt;1. The wealth of documentation on the Oracle page.&lt;br /&gt;2. The interactive install is very easy to do.&lt;br /&gt;&lt;br /&gt;and the negatives ...&lt;br /&gt;1. The rpms' default install location is /gridware &amp; I can't seem to get my yum repo to use a --prefix like option.  Something you can do with rpm.  Ideas welcome?&lt;br /&gt;2. The automatic install scripts have no debugging on them at all.  When they fail they just fail silently with no output or logs.  I have only managed an interactive install so far but I will try to set /bin/sh -x and see if it makes a difference.  
Hopefully I can get the automatic script running so that cfengine can deal with the install of the execution hosts rather than by hand.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1401682167744790021?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1401682167744790021/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1401682167744790021' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1401682167744790021'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1401682167744790021'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/05/sge-and-lustre.html' title='SGE and Lustre'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-2045690361660298746</id><published>2010-05-14T10:02:00.007+01:00</published><updated>2010-05-14T11:44:14.553+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='NAT'/><category scheme='http://www.blogger.com/atom/ns#' term='Transfer Tests'/><category scheme='http://www.blogger.com/atom/ns#' term='LHCb'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='gridftp'/><title type='text'>The return of LHCb at Glasgow</title><content type='html'>After weeks of investigating and debugging our LHCb transfer issue at Glasgow we have finally fixed it.  So ..... spill the beans I hear you cry.  
&lt;br /&gt;&lt;br /&gt;Well in short, we had an iptables rule on the INPUT filter of the NAT that was dropping strangely behaving gridftp connections.  This was relaxed and allowed inbound connections to be established.  This has solved the issue and we still have the protection of the campus firewall for security.&lt;br /&gt;&lt;br /&gt;Strangely behaving gridftp connections, what does that mean?  Well, transfers that had failed to work first time seemed to get into an unknown state and transfer no bytes, with many RETRY packets and no FIN packet.  It appears that these connections were trying to establish inbound connections. These were then dropped by a REJECT within our iptables. &lt;br /&gt;&lt;br /&gt;Moral of the story is, if you can get external IP's for your worker nodes, use them.  NAT'ing just adds complexity especially when dealing with GLOBUS.&lt;br /&gt;&lt;br /&gt;The full story if you are interested ....&lt;br /&gt;&lt;br /&gt;Problem:  LHCb don't use FTS.  They use direct outbound gridftp transfers of job outputs.  Jobs on WN's transfer results, using the lcg-utils tools, at the end of the job to CERN and failover to various T1's if there is an issue with the CERN transfer.  LHCb have seen a large failure rate with around 50% of gridftp/lcg-cp transfers failing at Glasgow.  Brunel, Sheffield and Lancaster have been affected with the same issue although to a lesser extent.  Failure rates at the other sites are much less at around 2-3%.  We see the initial transfer timing out, failing over to a T1, this sometimes works and sometimes fails over to another T1 and so on.  Why has this not been seen sooner?  Well this has actually been there since day dot but DIRAC masked the return code of the failure.  A new version of DIRAC catches the fail-overs and are killed by their watchdog.  
Thus bringing this issue to the surface.&lt;br /&gt;&lt;br /&gt;Investigation:  Glasgow looks like this &lt;span style="font-weight:bold;"&gt;WN's-&gt; NAT-&gt;CAMPUS FIREWALL-&gt;WORLD&lt;/span&gt;.   We managed to recreate the issue with a simple transfer test from varying amounts of WN's to test SRM end-points.  This recreated the issue and we saw a 50% failure rate across various SRM implementations, in particular &lt;span style="font-weight:bold;"&gt;CASTOR, DCACHE, STORM&lt;/span&gt;.  However, &lt;span style="font-weight:bold;"&gt;DPM&lt;/span&gt; transfers were 100% successful.  Failed transfers manifested themselves as lcg-cp: timed out or lcg-cp: error on send.  We repeated these tests using various VO's and got similar results so we did not think it was VO related.  We monitored the connections through our NAT and asked the firewall team to check if any outbound ports were blocked, they were not.  The &lt;span style="font-weight:bold;"&gt;GLOBUS_TCP_PORT_RANGE&lt;/span&gt; at Glasgow was set to a specific known open port range for inbound connections but this does not matter in this case of outbound connections.  To be on the safe side we set &lt;span style="font-weight:bold;"&gt;GLOBUS_TCP_SOURCE_RANGE&lt;/span&gt; for outbound connections through our NAT.  As we expected this did not make a difference.  After discussion with other sites we checked client libraries, OS and network.  One thing that did crop up was the use of NAT.&lt;br /&gt;&lt;br /&gt;The final test was 100 simultaneous transfers from one node via the NAT.  We saw a 50% failure rate.  We repeated this test but this time with an external address and no NAT routing.  This was 100% successful over 3 attempts.  Quickly repeated tests did show some failures but this was probably the firewall dropping connections.  Therefore, we were able to clearly identify the NAT as being the issue.  We tried tweaking TCP settings on the NAT i.e. 
&lt;span style="font-weight:bold;"&gt;tcp_fin_timeout, tcp_tw_reuse, tcp_tw_recycle, tcp_keepalive_time&lt;/span&gt; with no success.  The iptables rules themselves seemed sensible but we were still dropping 50% of the connections.&lt;br /&gt;&lt;br /&gt;We then moved to tcpdumping the tcp packets (&lt;span style="font-weight:bold;"&gt;SYN and FIN&lt;/span&gt;) from the internal (eth0) device and compared it to a tcpdump of the external (eth1) device.  You could clearly see the control channels opening, data channels opening, transfers and then around 50% of the transfers sending retry packets and never sending a &lt;span style="font-weight:bold;"&gt;FIN&lt;/span&gt;.  It looked like something was being blocked.&lt;br /&gt;&lt;br /&gt;A closer look at the iptables rules identified an entry on the &lt;span style="font-weight:bold;"&gt;INPUT&lt;/span&gt; filter that could be the culprit.  Further up the chain we were allowing &lt;span style="font-weight:bold;"&gt;RELATED,ESTABLISHED&lt;/span&gt; as you would expect. Then we had a &lt;code&gt;&lt;span style="font-weight:bold;"&gt;-A INPUT -i eth1 -p tcp -m tcp -j REJECT --reject-with tcp-reset&lt;/span&gt;&lt;/code&gt;.  It appears this entry caused attempts to re-establish the connection to fail (possibly by blocking the initial packet from the destination, erroneously considering it not to count as &lt;span style="font-weight:bold;"&gt;ESTABLISHED&lt;/span&gt; any more).  Very strange behaviour indeed.  
On the plus side we generally use the campus firewall to protect us from unwanted traffic rather than our own iptables rules, so we have relaxed the &lt;span style="font-weight:bold;"&gt;INPUT&lt;/span&gt; filter and guess what, near 100% transfer success.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2045690361660298746?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2045690361660298746/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2045690361660298746' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2045690361660298746'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2045690361660298746'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/05/return-of-lhcb-at-glasgow.html' title='The return of LHCb at Glasgow'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1539276511807836567</id><published>2010-05-13T18:05:00.004+01:00</published><updated>2010-05-13T22:05:03.198+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='ATLAS'/><category scheme='http://www.blogger.com/atom/ns#' term='Users'/><title type='text'>A fistful of user jobs...</title><content type='html'>&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;No ATLAS production to do in the UK, but we have a nice full 
cluster anyway, with more than 1000 user jobs running:&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;br /&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;br /&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;svr016:~# qstat -q&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;server: svr016.gla.scotgrid.ac.uk&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;Queue &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;Memory CPU Time Walltime Node &amp;nbsp;Run Que Lm &amp;nbsp;State&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;---------------- ------ -------- -------- ---- &amp;nbsp;--- --- -- &amp;nbsp;-----&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" 
style="font-family: 'Courier New', Courier, monospace;"&gt;q2d &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; 48:00:00 48:00:00 &amp;nbsp; -- &amp;nbsp;134 &amp;nbsp;12 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;atlanaly &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; -- &amp;nbsp; 24:00:00 24:00:00 &amp;nbsp; -- &amp;nbsp;738 789 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;atlprd &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; -- &amp;nbsp; 48:00:00 48:00:00 &amp;nbsp; -- &amp;nbsp; &amp;nbsp;2 &amp;nbsp;48 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;q7d &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; 168:00:0 168:00:0 &amp;nbsp; -- &amp;nbsp; &amp;nbsp;0 &amp;nbsp; 0 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;route2all &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; &amp;nbsp; &amp;nbsp; -- &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; &amp;nbsp;0 &amp;nbsp; 0 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier 
New', Courier, monospace;"&gt;q1d &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; 24:00:00 24:00:00 &amp;nbsp; -- &amp;nbsp; 94 &amp;nbsp;16 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;mpi &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; &amp;nbsp;72:00:00 &amp;nbsp; -- &amp;nbsp; &amp;nbsp;0 &amp;nbsp; 0 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;atlas &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;-- &amp;nbsp; 24:00:00 24:00:00 &amp;nbsp; -- &amp;nbsp;470 312 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;lhcb &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; -- &amp;nbsp; 48:00:00 48:00:00 &amp;nbsp; -- &amp;nbsp; &amp;nbsp;0 &amp;nbsp; 0 -- &amp;nbsp; E R&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;&amp;nbsp;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; ----- -----&lt;/span&gt;&lt;/div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; 
margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;&amp;nbsp;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;1438 &amp;nbsp;1177&lt;/span&gt;&lt;/div&gt;&lt;div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;br /&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;span class="Apple-style-span" style="font-family: inherit;"&gt;The&amp;nbsp;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;atlanaly&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: inherit;"&gt;queue are jobs from the panda backend and the&amp;nbsp;&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;atlas&lt;/span&gt;&lt;span class="Apple-style-span" style="font-family: inherit;"&gt;&amp;nbsp;queue takes WMS backend jobs.&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;br /&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;Today the particular job mix was kind to the storage, with no overloads being seen, but it's something we constantly have to monitor to pre-empt problems.&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;div style="margin-bottom: 0px; margin-left: 0px; margin-right: 0px; margin-top: 0px;"&gt;&lt;br /&gt;&lt;i&gt;Postscript&lt;/i&gt;: I had another look and realised that most of the WMS backend jobs were from hammercloud (Sam testing SSDs!). 
Seems that genuine user WMS jobs were about 20-30, with more than 1000 in the panda backend.&lt;br /&gt;&lt;br /&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1539276511807836567?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1539276511807836567/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1539276511807836567' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1539276511807836567'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1539276511807836567'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/05/fistful-of-user-jobs.html' title='A fistful of user jobs...'/><author><name>Graeme Stewart</name><uri>http://www.blogger.com/profile/04113191724360870254</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='21' height='32' src='http://www.physics.gla.ac.uk/~graeme/graeme.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-356800796551001506</id><published>2010-05-05T10:35:00.002+01:00</published><updated>2010-05-05T10:45:54.209+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='cream'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>CREAM thickens</title><content type='html'>CREAM at Glasgow has been upgraded to the latest glite3.2 release 3.2.5-0.sl5 (or INFN version 1.6).  
This brings lots of enhancements like&lt;br /&gt;&lt;br /&gt;&lt;a href="http://grid.pd.infn.it/cream/field.php?n=Main.Self-limitingCREAMBehavior"&gt;self limiting behaviour a'la WMS&lt;/a&gt;&lt;br /&gt;&lt;a href="http://grid.pd.infn.it/cream/field.php?n=Main.HowToConfigureTheProxyPurger"&gt;A new proxy purger to clean the delegationdb and from the file system the expired proxies &lt;/a&gt;&lt;br /&gt;&lt;a href="http://grid.pd.infn.it/cream/field.php?n=Main.HowToCustomizeTheCREAMJobWrapper"&gt;a new way to customize the job wrapper&lt;/a&gt;&lt;br /&gt;an improved proxy renewal mechanism &lt;br /&gt;and one of my favourites, support for ISB/OSB transfers from/to gridftp servers run using user credentials rather than server host certificates.  This will work well with users running gridftp servers on their own machines for example (as long as they don't turn them off when they go home at night!)&lt;br /&gt;&lt;br /&gt;you can find out more about them all &lt;a href="http://glite.web.cern.ch/glite/packages/R3.2/sl5_x86_64/deployment/glite-CREAM/3.2.5-0.sl5/glite-CREAM-3.2.5-0.sl5-update.html"&gt;here&lt;/a&gt;.  
Now back to draining the WMS.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-356800796551001506?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/356800796551001506/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=356800796551001506' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/356800796551001506'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/356800796551001506'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/05/cream-thickens.html' title='CREAM thickens'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7088681088287862640</id><published>2010-04-28T14:34:00.004+01:00</published><updated>2010-05-05T10:35:43.342+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>batch migrations</title><content type='html'>A week ago we finally migrated our batch system to better hardware.  This had been on the cards for a while but was expedited as we need to start a series of server moves from an old rack which will be removed come the installation of our new kit.  We also took this opportunity to upgrade the pbs server version inline with our mom version which was a little out of step.  If that wasn't enough things changing at once we also built the latest MAUI 3.3 to test how it performs.  
So far so good.&lt;br /&gt;&lt;br /&gt;Next up, will be the two WMS.  Both will be put in downtime, drained and then moved out of the old rack.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7088681088287862640?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7088681088287862640/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7088681088287862640' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7088681088287862640'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7088681088287862640'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/04/batch-migrations.html' title='batch migrations'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4525242277324666773</id><published>2010-04-28T14:28:00.003+01:00</published><updated>2010-04-28T14:34:32.416+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='ATLAS'/><category scheme='http://www.blogger.com/atom/ns#' term='SSD'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>serious multi core</title><content type='html'>who would have thought it possible .... 
&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_k7p-Ualo6NQ/S9g4YvIKWKI/AAAAAAAAABI/lh5nzFpyZ4Y/s1600/Screen+shot+2010-04-27+at+10.59.06.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 295px;" src="http://3.bp.blogspot.com/_k7p-Ualo6NQ/S9g4YvIKWKI/AAAAAAAAABI/lh5nzFpyZ4Y/s320/Screen+shot+2010-04-27+at+10.59.06.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5465180145432811682" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;well with a large SSD, 24 cores, file stager analysis and it survives!  More soon on our testing with some cutting edge equipment.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4525242277324666773?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4525242277324666773/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4525242277324666773' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4525242277324666773'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4525242277324666773'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/04/serious-multi-core.html' title='serious multi core'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_k7p-Ualo6NQ/S9g4YvIKWKI/AAAAAAAAABI/lh5nzFpyZ4Y/s72-c/Screen+shot+2010-04-27+at+10.59.06.png' height='72' 
width='72'/><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3363439049923799433</id><published>2010-04-13T16:49:00.003+01:00</published><updated>2010-04-13T16:58:48.343+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Uppsala'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='NA4'/><category scheme='http://www.blogger.com/atom/ns#' term='EGEE'/><category scheme='http://www.blogger.com/atom/ns#' term='Users'/><title type='text'></title><content type='html'>The User Forum in Uppsala continues with lots of interesting talks today.  More user focussed today with sessions from Bioinformatics, Earth Science and Computational Chemistry.  Again the buzz words of cloud, &lt;a href="http://aws.amazon.com/ec2/"&gt;EC2&lt;/a&gt;, &lt;a href="http://www.eucalyptus.com/"&gt;Eucalyptus&lt;/a&gt; and &lt;a href="http://www.opennebula.org/"&gt;Open Nebula&lt;/a&gt; continue to be mentioned during the Novel Technologies and Architectures sessions.&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_k7p-Ualo6NQ/S8SSmqodi-I/AAAAAAAAAAc/UPy_oprGAn8/s1600/cathedral.jpg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 240px; height: 320px;" src="http://2.bp.blogspot.com/_k7p-Ualo6NQ/S8SSmqodi-I/AAAAAAAAAAc/UPy_oprGAn8/s320/cathedral.jpg" border="0" alt="" id="BLOGGER_PHOTO_ID_5459649841256565730" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;center&gt;The cathedral in Uppsala.&lt;/center&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3363439049923799433?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3363439049923799433/comments/default' 
title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3363439049923799433' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3363439049923799433'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3363439049923799433'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/04/user-forum-in-uppsala-continues-with.html' title=''/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_k7p-Ualo6NQ/S8SSmqodi-I/AAAAAAAAAAc/UPy_oprGAn8/s72-c/cathedral.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-379187182003207770</id><published>2010-04-13T06:55:00.004+01:00</published><updated>2010-04-13T07:11:34.325+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Uppsala'/><category scheme='http://www.blogger.com/atom/ns#' term='NA4'/><category scheme='http://www.blogger.com/atom/ns#' term='EGEE'/><category scheme='http://www.blogger.com/atom/ns#' term='Users'/><title type='text'>Uppsala Begins</title><content type='html'>The last EGEE User Conference kicked off yesterday in Uppsala, Sweden.  In fact this will be the last EGEE event ever as the project finally shuts its doors at the end of the month.  Even with this sad event looming everyone is in high spirits with the transition to EGI and the change that this will bring.  Monday saw the conference begin with some interesting plenaries, including the history of Uppsala University.  The 'old' building was the only building to be saved when the entire town burnt down.  
The 'new' building is actually constructed from the remnants of an old boat.  Bought by the builder who was later made bankrupt by stumping up the cash in order to complete the building out of his own pocket.  You could never tell that this incredibly ornate main auditorium has columns made of cast iron and an incredibly useful bullet proof ceiling of solid steel plates!&lt;br /&gt;&lt;br /&gt;The rest of the day followed with sessions on security, user support, application porting and the 1st of two poster sessions.  Tuesday will start with two technical plenaries and the first and last EGEE photo call.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-379187182003207770?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/379187182003207770/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=379187182003207770' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/379187182003207770'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/379187182003207770'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/04/uppsala-begins.html' title='Uppsala Begins'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4318186857970559630</id><published>2010-04-08T14:19:00.003+01:00</published><updated>2010-04-08T14:30:10.869+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='ATLAS'/><category 
scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='Panda'/><category scheme='http://www.blogger.com/atom/ns#' term='FTS'/><title type='text'>Take my outputs, damn you...</title><content type='html'>&lt;div style="text-align: center;"&gt;&lt;br /&gt;&lt;/div&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 319px;" src="http://1.bp.blogspot.com/_eLhn96sA3hw/S73YRzo-jcI/AAAAAAAAAio/BuyMeJ2w0V8/s400/week.php.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5457756123874364866" /&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 155px;" src="http://4.bp.blogspot.com/_eLhn96sA3hw/S73YXYAjPsI/AAAAAAAAAiw/omsTa47XV_g/s400/fts-graph.pl.gif" border="0" alt="" id="BLOGGER_PHOTO_ID_5457756219536260802" /&gt;&lt;div style="text-align: center;"&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://1.bp.blogspot.com/_eLhn96sA3hw/S73YRzo-jcI/AAAAAAAAAio/BuyMeJ2w0V8/s1600/week.php.png"&gt;&lt;br /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;div style="text-align: center;"&gt;&lt;span class="Apple-style-span"  style="color:#0000EE;"&gt;&lt;span class="Apple-style-span" style="text-decoration: underline;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;/span&gt;&lt;/div&gt;We recently ran up a very large backlog of production output files waiting to go from Glasgow back to the Tier-1 (reminder, panda doesn't consider a job finished until the outputs are safely stored at the T1). This is clearly seen in the red line on the panglia plot above, which reaches very high values. 
As we recently cut the timeout for the UK cloud to 2 days for transferring jobs, to improve the responsiveness of the production system, we started to leak out failed jobs (light green line) as panda gave up and decided to rerun.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Fortunately we got a big boost in the number of FTS slots from Glasgow to RAL, increasing from 10 to 25 active transfers (see the bottom FTS monitoring plot). Even so it clearly takes 24 hours for all the backlogs to drain down.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;One of the problems here is that the output files are small from simulation (a tiny log file and a 20-50MB HITS file), so the overheads of FTS + SRM are very considerable and the actual bandwidth achieved is quite low. One possibility we are considering in ATLAS is introducing a pre-merge of outputs on the T2, which will allow us to send much bigger files back to the T1 (although a final "super-merge" will probably still be necessary). 
For this we are waiting for the generic Athena merge transform and then we will need to test integrating this into the mainline production workflow.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Until then we just have to take the operational load of tweaking the FTS settings when necessary.&lt;br /&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4318186857970559630?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4318186857970559630/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4318186857970559630' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4318186857970559630'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4318186857970559630'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/04/take-my-outputs-damn-you.html' title='Take my outputs, damn you...'/><author><name>Graeme Stewart</name><uri>http://www.blogger.com/profile/04113191724360870254</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='21' height='32' src='http://www.physics.gla.ac.uk/~graeme/graeme.jpg'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/_eLhn96sA3hw/S73YRzo-jcI/AAAAAAAAAio/BuyMeJ2w0V8/s72-c/week.php.png' height='72' width='72'/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7838965688982970839</id><published>2010-04-06T09:40:00.004+01:00</published><updated>2010-04-06T09:45:50.276+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='cream'/><category 
scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>CREAM gets an upgrade</title><content type='html'>The CREAM instance at Glasgow has now been upgraded to the latest SL5 version.  This continues the push to migrate those services that can be moved from SL4 to SL5 and should also make it easier to upgrade to the new 1.6 instance when it is released.  The only hitch to a relatively painless upgrade was cfengine tweaking LCAS and replacing 64 bit path names with 32 bit paths.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7838965688982970839?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7838965688982970839/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7838965688982970839' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7838965688982970839'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7838965688982970839'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/04/cream-gets-upgrade.html' title='CREAM gets an upgrade'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-100768133278750817</id><published>2010-04-01T10:32:00.004+01:00</published><updated>2010-04-06T09:40:18.237+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='BDII'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>Où est le site 
bdii</title><content type='html'>Our upgrade to the SL5 gLite3.2 site bdii has been tormenting me of late as even though the BDII was installed, it was only returning data from a local ldapsearch. &lt;br /&gt;&lt;br /&gt;It was listening on port 2170 and the bdii process was running.  Then when you tried an ldapsearch from a local machine, it worked.  Trying it from an external machine, it could not connect.&lt;br /&gt;&lt;br /&gt;First thought was firewall but iptables was not working.  Then what about campus firewall.  Nope, nothing had changed there.  I checked the configs from SL4 to SL5 and they were the same.  I turned on logging for slapd and turned up the verbosity.  You could then see the DENY's being made by slapd itself.&lt;br /&gt;&lt;br /&gt;After much googling I tried slapd in /etc/hosts.allow and this worked!  It looks like with the transition to SL5 there is a requirement to add the slapd service to hosts.allow.  This looks to have been a bug with openldap in SL4.&lt;br /&gt;&lt;br /&gt;With the site bdii upgraded the change over occurred yesterday.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-100768133278750817?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/100768133278750817/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=100768133278750817' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/100768133278750817'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/100768133278750817'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/04/ou-est-le-site-bdii.html' title='Où est le site bdii'/><author><name>dug 
mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8302679307394555733</id><published>2010-03-26T11:54:00.005Z</published><updated>2010-03-26T12:04:16.208Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='APEL'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='Accounting'/><title type='text'>'EventRecords' is full</title><content type='html'>Our accounting database appears to be full.&lt;br /&gt;&lt;pre&gt;org.glite.apel.core.ApelException: java.sql.SQLException: The table 'EventRecords' is full&lt;/pre&gt; Hmmm, what to do.  Increase or archive?&lt;br /&gt;&lt;br /&gt;You can see what is set from: &lt;tt&gt;SHOW TABLE STATUS FROM accounting LIKE 'EventRecords';&lt;/tt&gt;&lt;br /&gt;&lt;br /&gt;and if you want to increase you can use: &lt;tt&gt;ALTER TABLE accounting MAX_ROWS=1000000000 AVG_ROW_LENGTH=338;&lt;/tt&gt;&lt;br /&gt;&lt;br /&gt;But surely the correct thing would be to archive. Handily the archival procedure is documented on the &lt;a href="http://goc.grid.sinica.edu.tw/gocwiki/ApelFaq#head-956e1f2eb88d37271361f0c5e5fc9de9f79622af"&gt;APEL wiki&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;It is useful to know that the default size of MyISAM tables in MYSQL4 is 4Gb.  Luckily in MYSQL5 and above the table limit is much higher.   
I wonder if the new SL5 APEL will ship with innodb tables?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8302679307394555733?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8302679307394555733/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8302679307394555733' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8302679307394555733'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8302679307394555733'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/03/eventrecords-is-full.html' title='&apos;EventRecords&apos; is full'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8129853435256036129</id><published>2010-03-18T09:51:00.003Z</published><updated>2010-03-18T10:16:43.169Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='maui'/><title type='text'>Corralling jobs in Maui.</title><content type='html'>Sometimes, when testing new hardware or software in a limited way, it is important to be able to arrange lightweight, temporary partitions of a cluster for only a given user.&lt;div&gt;Now, you could repartition the cluster nodes between a "normal" partition and a "testing" partition, but for most pbs/maui clusters (which don't have anything but the 'ALL' partition set), this involves changing configuration for 
&lt;b&gt;all&lt;/b&gt; the nodes, rather than simply the nodes we care about. (And then changing it back when you're finished.)&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;You might also consider doing this with reservations - indeed, the maui manual suggests that a reservation locked to a user specified with an &amp;amp; prefix will force precisely the behaviour we want - locking the reservation and the user together. This appears not to work under empirical testing.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Instead, the solution we've found to work is (all in &lt;code&gt;maui.cfg&lt;/code&gt;):&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;ol&gt;&lt;li&gt;Create a reservation for the user only.&lt;br /&gt;&lt;code&gt;SRCFG[ssdnodes] PERIOD=INFINITY&lt;br /&gt;SRCFG[ssdnodes] STARTTIME=00:00:00 ENDTIME=24:00:00&lt;br /&gt;SRCFG[ssdnodes] HOSTLIST=node30[0-9]&lt;br /&gt;SRCFG[ssdnodes] USERLIST=ssp001&lt;br /&gt;&lt;/code&gt;&lt;/li&gt;&lt;li&gt;Create a quality of service class with the property that it only runs on that reservation.&lt;br /&gt;&lt;code&gt;QOSCFG[ssd]     QFLAGS=USERESERVED:ssdnodes&lt;/code&gt;&lt;/li&gt;&lt;li&gt;Make the user a member of that quality of service class &lt;b&gt;only&lt;/b&gt;.&lt;br /&gt;&lt;code&gt;USERCFG[ssp001] QDEF=ssd QLIST=ssd&lt;/code&gt;&lt;/li&gt;&lt;/ol&gt;&lt;div&gt;(In this case, the configuration mutually restricts the user &lt;code&gt;ssp001&lt;/code&gt; and the nodes &lt;code&gt;node300&lt;/code&gt; to &lt;code&gt;node309&lt;/code&gt; to each other.)&lt;/div&gt;&lt;div&gt;This has the benefit that it also generalises to any number of users, as long as you add them to the reservation and the QoS class.&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8129853435256036129?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' 
type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8129853435256036129/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8129853435256036129' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8129853435256036129'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8129853435256036129'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/03/corralling-jobs-in-maui.html' title='Corralling jobs in Maui.'/><author><name>Sam Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4621653042084686584</id><published>2010-03-12T10:23:00.005Z</published><updated>2010-03-12T11:26:59.412Z</updated><title type='text'>SSDs - the testing begins!</title><content type='html'>This Monday (finally!) we received (half) of the SSDs we ordered for our storage testing plans.&lt;div&gt;These are the Intel G2 X-25s which are intended to represent the mid-range of the SSDs available currently (the low end ones are still due to arrive, and our high end card is being tested differently).&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Just as a sneak preview, we had a chance to run iozone against one of the X-25s, in the same configuration as I've previously run against our newer disk servers (in RAID6 mode). As you can see from the graphs below, the SSDs behave exactly as we'd expect - the throughput is almost identical on random or ordered reads, whilst the RAID array suffers significantly from having to seek. 
Indeed, although the 22 drives in the array give it much better read performance when not seeking, the single X25 seems to equal the RAID array's performance when seeking is needed...&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_cPmxvuJqJHY/S5ochZ2Rd1I/AAAAAAAAADA/pmI7oKPbUxg/s1600-h/Screen+shot+2010-03-12+at+10.47.49.png"&gt;&lt;img style="cursor:pointer; cursor:hand;width: 320px; height: 211px;" src="http://3.bp.blogspot.com/_cPmxvuJqJHY/S5ochZ2Rd1I/AAAAAAAAADA/pmI7oKPbUxg/s320/Screen+shot+2010-03-12+at+10.47.49.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5447698059457820498" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_cPmxvuJqJHY/S5ocYkootoI/AAAAAAAAAC4/FcSmepxVC74/s1600-h/Screen+shot+2010-03-12+at+10.47.20.png"&gt;&lt;img style="cursor:pointer; cursor:hand;width: 320px; height: 212px;" src="http://3.bp.blogspot.com/_cPmxvuJqJHY/S5ocYkootoI/AAAAAAAAAC4/FcSmepxVC74/s320/Screen+shot+2010-03-12+at+10.47.20.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5447697907734591106" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;Next thing on the list is testing them in Worker nodes against Analysis and Production workloads.&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4621653042084686584?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4621653042084686584/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4621653042084686584' title='0 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/32189452/posts/default/4621653042084686584'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4621653042084686584'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/03/ssds-testing-begins.html' title='SSDs - the testing begins!'/><author><name>Sam Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_cPmxvuJqJHY/S5ochZ2Rd1I/AAAAAAAAADA/pmI7oKPbUxg/s72-c/Screen+shot+2010-03-12+at+10.47.49.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7300395629109743372</id><published>2010-03-12T09:39:00.002Z</published><updated>2010-03-12T09:51:38.967Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='LHCb'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>LHCb Production Failures</title><content type='html'>Over the last week we have been investigating why we have around 50% failure rate with LHCb jobs.  All seem to be failing with the same issue which is sometimes not being able to copy their results back to the Tier 0 or subsequent fail-over Tier 1 site. This is not strictly just a Glasgow issue and it has affected Sheffield and Brunel, although the issue appears to have gone away from Brunel.&lt;br /&gt;&lt;br /&gt;We have tried pretty much everything, as simple lcg-ls and lcg-cp actually work from the worker nodes so its not a certificate issue.  The failures are not particular to a CE.  Nothing changed at our site prior to the failure and LHCb say nothing changed at their end. In fact they have sites in the UK such as Manchester working fine.  
&lt;br /&gt;&lt;br /&gt;None of the failures correspond to a particular set of worker nodes which might indicate NAT issues for us as we split our odd and even nodes through separate NAT's.  However, it does look like network contention at some point in the process as we see either broken pipes or timeouts in the logs direct from Globus.  &lt;br /&gt;&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;2010-03-04 04:04:56 UTC dirac-jobexec.py ERROR: SRM2Storage.__putFile: Failed to put file to storage. file:/tmp/8230840/CREAM603030715/7472318/00005987_00009161_3.dst: globus_xio: System error in writev: Broken pipe&lt;br /&gt;2010-03-04 04:04:56 UTC dirac-jobexec.py ERROR: globus_xio: A system call failed: Broken pipe&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;  &lt;br /&gt;The only constant so far is that there appears to be a 50% failure rate from failed uploads which happens consistently from submissions from DIRAC.  &lt;br /&gt;&lt;br /&gt;It's certainly a puzzler and we are fast running out of ideas!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7300395629109743372?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7300395629109743372/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7300395629109743372' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7300395629109743372'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7300395629109743372'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/03/lhcb-production-failures.html' title='LHCb Production Failures'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1852920583285205836</id><published>2010-03-12T09:30:00.004Z</published><updated>2010-03-12T09:39:43.361Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='NAT'/><category scheme='http://www.blogger.com/atom/ns#' term='camont'/><category scheme='http://www.blogger.com/atom/ns#' term='LHCb'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>NATs Maxing Out</title><content type='html'>During our investigation of our LHCb failures we noticed that our number of conntrack entries on our two NAT hosts were in fact being totally used up i.e. all 43200!  By looking at /proc/net/ip_conntrack we noticed that most of the connections were in fact udp DNS lookups by Camont jobs.  We also noticed that we had not changed the default timeouts, 32768 for tcp and 3600 for udp.   This was probably the reason they were being used up.  So we have tweaked the timeouts and increased the maximum.&lt;br /&gt;So our new NAT settings look like this:&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;/etc/sysctl.conf&lt;br /&gt;original values of 43200, 32768, 3600 respectively.&lt;br /&gt;net.ipv4.netfilter.ip_conntrack_tcp_timeout_established = 21600&lt;br /&gt;net.ipv4.netfilter.ip_conntrack_max = 65536&lt;br /&gt;net.ipv4.netfilter.ip_conntrack_udp_timeout = 30&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Now our NATs look much healthier.  Only problem - it didn't help with LHCb production jobs not being able to upload their results back to CERN.  
Back to the drawing board.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1852920583285205836?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1852920583285205836/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1852920583285205836' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1852920583285205836'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1852920583285205836'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/03/nats-maxing-out.html' title='NATs Maxing Out'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1669405789285123001</id><published>2010-03-01T15:12:00.006Z</published><updated>2010-03-01T15:28:50.629Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='voms'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='Users'/><title type='text'>local users before pool users</title><content type='html'>Further to the original post by Graeme &lt;a href="http://scotgrid.blogspot.com/2008/02/to-voms-or-not-to-voms-that-is-question.html"&gt;'to voms or not to voms'&lt;/a&gt;.  
The Nikhef documentation has been thoroughly overhauled and I have now been able to switch lcmaps in CREAM and SCAS over to use local unix group mappings before pool accounts, if they exist.&lt;br /&gt;&lt;br /&gt;The main changes are changing localaccount to pull in the glasgow centric grid-mapfile. &lt;br /&gt;&lt;pre&gt;&lt;br /&gt;localaccount = "lcmaps_localaccount.mod"&lt;br /&gt;               " -gridmapfile /usr/local/etc/grid-mapfile-local"&lt;br /&gt;              # " -gridmapfile /etc/grid-security/grid-mapfile"&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;Some small tweaks are required to move localaccount from the last check to the first check.  If this is successful it uses that account, otherwise it moves to check voms and pool accounts.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;glexec_get_account:&lt;br /&gt;proxycheck -&gt; localaccount&lt;br /&gt;localaccount -&gt; good | vomslocalgroup&lt;br /&gt;#proxycheck -&gt; vomslocalgroup&lt;br /&gt;vomslocalgroup -&gt; vomspoolaccount | poolaccount&lt;br /&gt;vomspoolaccount -&gt; good | vomslocalaccount&lt;br /&gt;vomslocalaccount -&gt; good | poolaccount&lt;br /&gt;poolaccount -&gt; good #| localaccount&lt;br /&gt;&lt;br /&gt;glexec_verify_account:&lt;br /&gt;proxycheck -&gt; localaccount&lt;br /&gt;localaccount -&gt; good | vomslocalgroup&lt;br /&gt;#proxycheck -&gt; vomslocalgroup&lt;br /&gt;vomslocalgroup -&gt; vomspoolaccount | poolaccount&lt;br /&gt;vomspoolaccount -&gt; good | vomslocalaccount&lt;br /&gt;vomslocalaccount -&gt; good | poolaccount&lt;br /&gt;poolaccount -&gt; good #| localaccount&lt;br /&gt;&lt;/pre&gt; &lt;br /&gt;SCAS is works in the same way and all that is required is to change the localaccount setting to pull in our Glasgow local grid-mapfile a'la&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;localaccount = "lcmaps_localaccount.mod"&lt;br /&gt;                " -gridmapfile /usr/local/etc/grid-mapfile-local"&lt;br /&gt;#               " -gridmapfile /etc/grid-security/grid-mapfile"&lt;br /&gt;&lt;br 
/&gt;&lt;/pre&gt;&lt;br /&gt;Job done.  I can now flit between gla or pool accounts depending on my existence in &lt;tt&gt;/usr/local/etc/grid-mapfile-local&lt;/tt&gt;&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;Job id                    Name             User            Time Use S Queue&lt;br /&gt;------------------------- ---------------- --------------- -------- - -----&lt;br /&gt;2013.svr008               cream_441636610  ssp001                 0 R q2d            &lt;br /&gt;2014.svr008               cream_963867097  gla057                 0 Q q2d&lt;br /&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1669405789285123001?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1669405789285123001/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1669405789285123001' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1669405789285123001'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1669405789285123001'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/03/further-to-original-post-by-graeme-to.html' title='local users before pool users'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-2883505145667549169</id><published>2010-03-01T13:48:00.002Z</published><updated>2010-03-01T13:57:02.165Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='VM'/><category 
scheme='http://www.blogger.com/atom/ns#' term='vmware'/><category scheme='http://www.blogger.com/atom/ns#' term='SL5.4'/><title type='text'>VMware Web admin vs SL5.4: fight!</title><content type='html'>Recently, we've acquired some hefty servers for the purposes of running virtual machines (initially for test purposes and cheap dev boxes, but potentially for service hosting depending on how well it goes). We're using VMWare Server, which, although it comes with some command line tools, very much wants you to use the fancy web interface that it runs on non-standard ports.&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;This was fine, except that it seemed extremely flaky on all our test servers - randomly crashing, sometimes taking out a running VM with it. &lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;It turns out that this is all the fault of our running an up-to-date version of SL. SL5.4 (actually, anything based on RHEL5.4, one assumes) has a version of glibc that VMWare really doesn't get on with well. 
&lt;/div&gt;&lt;div&gt;Once we copied the 5.3 release of libc.so.6 from a 5.3 server into a suitable place, and pointed VMware's LD_LIBRARY_PATH at it, it seems much more stable.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;(The relevant bug report, including fix suggestions is:&lt;/div&gt;&lt;div&gt;http://bugs.centos.org/view.php?id=3884 )&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2883505145667549169?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2883505145667549169/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2883505145667549169' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2883505145667549169'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2883505145667549169'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/03/vmware-web-admin-vs-sl54-fight.html' title='VMware Web admin vs SL5.4: fight!'/><author><name>Sam Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-997554338917709243</id><published>2010-02-17T15:18:00.002Z</published><updated>2010-02-17T16:02:32.206Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='MPI'/><category scheme='http://www.blogger.com/atom/ns#' term='maui'/><category 
scheme='http://www.blogger.com/atom/ns#' term='OPENMPI'/><title type='text'>more openmpi tweaking</title><content type='html'>Whilst testing MPI on our cluster to get it into a usable state I uncovered a rather nasty bug with openmpi-1.3.4.  This manifested itself with never being able to run on the node with cores &gt; 4. It was a weird one as openmpi communication over two nodes worked fine with 8 cores on each node but when a job requested cores &gt; 4 on the same node.  The job just hung.  An strace of the mpiexec process suggested some sort of TIMEOUT/WAIT issue. &lt;br /&gt;&lt;br /&gt;On the release note for openmpi-1.4.1 it appears they discovered this bug and provided a fix:&lt;br /&gt;- Fix a shared memory "hang" problem that occurred on x86/x86_64&lt;br /&gt;  platforms when used with the GNU &gt;=4.4.x compiler series. &lt;br /&gt;&lt;br /&gt;This sounded plausible and in fact an upgrade has fixed the issue.  &lt;br /&gt;&lt;br /&gt;So now with all 8 cores running on the same node the next issue to arise was one related to Maui.  Some time when you requested nodes=8, Maui scheduled the job on 3 cores, a qdel and a resubmission later Maui rescheduled the job onto 5 cores.  On one test I even qrun'd the job and it appeared to start on the correct number of nodes but there appeared to be no reason for Maui not getting this correct.  So it was time to get out Maui docs.&lt;br /&gt;&lt;br /&gt;from the docs;&lt;br /&gt;Maui is by default very liberal in its interpretation of &amp;lt;NODECOUNT&amp;gt;:PPN=&amp;lt;X&amp;gt;.  In its standard configuration, Maui interprets this as 'give the job &amp;lt;NODECOUNT&amp;gt;*&amp;lt;X&amp;gt; tasks with AT LEAST &amp;lt;X&amp;gt; tasks per node'.  Set the JOBNODEMATCHPOLICY parameter to EXACTNODE to have Maui support PBS's default allocation behavior of &amp;lt;NODECOUNT&amp;gt; nodes with exactly &amp;lt;X&amp;gt; tasks per node. &lt;br /&gt;&lt;br /&gt;This seemed to suggest that Maui's default behaviour is to pack a job into as few nodes as possible.  
So I tried out setting the JOBNODEMATCHPOLICY to EXACTNODE and this seems to have done the trick. &lt;br /&gt;&lt;br /&gt;nodes=24 means 24 nodes, not 8, not 6 but 24&lt;br /&gt;&lt;br /&gt;This does have a drawback in that it will be 24 separate nodes.  This setting relies upon being able to set :ppn (processes per node) to allow nodes=3:ppn=8 giving 24 cores which is really what you want to say.  As you probably have a fast machine with loads of memory and cores.  Therefore, you could target all the cores rather than 24 nodes.  However, it is a start.&lt;br /&gt;&lt;br /&gt;Wouldn't it be nice if you could specify :ppn in JDL.  The only way round this I can see for now is to manually change the job manager or use the local batch attributes of CREAM to allow a custom cerequirement to be specified.  Possible but not nice.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-997554338917709243?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/997554338917709243/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=997554338917709243' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/997554338917709243'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/997554338917709243'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/02/more-openmpi-tweaking.html' title='more openmpi tweaking'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' 
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-2782627460372864665</id><published>2010-02-16T11:12:00.006Z</published><updated>2010-02-17T15:17:44.298Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='CE'/><category scheme='http://www.blogger.com/atom/ns#' term='cream'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>cream sours</title><content type='html'>Well we now know how much it takes to kill our CREAM instance.  Yesterday it stopped working completely and it appeared to be caught in a tailspin with the Lease and Proxy Renew processes within CREAM.  Grepping the logs indicated that most of the Renewals and Lease Manager entries were all related to condor submission from ATLAS.&lt;br /&gt;&lt;br /&gt;From speaking to Massimo at INFN it was described how Proxy and Lease renewals are operations which are executed with higher priorities wrt other commands. 
One hypothesis might be that the CREAM CE was so overloaded doing these commands that it was unable to deal with basic job submission since all the test jobs I submitted never made it out of the REGISTERED state.&lt;br /&gt;&lt;br /&gt;It looked bad on Ganglia: &lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_k7p-Ualo6NQ/S3wHhMWMMkI/AAAAAAAAAAM/_mGX5EOCAlk/s1600-h/Screen+shot+2010-02-17+at+14.59.38.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 117px;" src="http://3.bp.blogspot.com/_k7p-Ualo6NQ/S3wHhMWMMkI/AAAAAAAAAAM/_mGX5EOCAlk/s320/Screen+shot+2010-02-17+at+14.59.38.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5439230716788093506" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;The first course of action was to disable job submission using the command line tool: glite-ce-disable-submission and try to deal with the renewals.  This worked for a time but they reoccurred later on that evening.&lt;br /&gt;&lt;br /&gt;The timestamps on these ATLAS cream jobs seem to be very old and hinted at stale jobs so the next course of action was to manually purge the database using the tool provided by the CREAM developers: &lt;a href="http://grid.pd.infn.it/cream/field.php?n=Main.HowToPurgeJobsFromTheCREAMDB"&gt;here&lt;/a&gt;.  The easiest way I could see to do this was to connect to the creamdb, select out the id's and create a script that called the purger for each id.  
Note: you need jdk 1.6 in order to run the purger!&lt;br /&gt;&lt;br /&gt;This ended up removing around 3000 CREAM entries.&lt;br /&gt;&lt;br /&gt;Ganglia looked much happier:&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_k7p-Ualo6NQ/S3wHswUEmYI/AAAAAAAAAAU/FxvgLdch1hU/s1600-h/Screen+shot+2010-02-17+at+14.59.50.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 119px;" src="http://2.bp.blogspot.com/_k7p-Ualo6NQ/S3wHswUEmYI/AAAAAAAAAAU/FxvgLdch1hU/s320/Screen+shot+2010-02-17+at+14.59.50.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5439230915421444482" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;So I think you have to be careful when getting submissions from Condor at the moment as it looks to be quite easy to denial of service your CREAM CE.&lt;br /&gt;&lt;br /&gt;Roll on CREAM 1.6&lt;br /&gt;&lt;br /&gt;- That proxy renewal is not very efficient in the release now in production (already addressed in the coming CREAM CE: see &lt;a href="http://savannah.cern.ch/bugs/?51993"&gt;here&lt;/a&gt;)&lt;br /&gt;- When there are too many pending commands, new job submissions will be disabled by the limiter: see &lt;a href="http://grid.pd.infn.it/cream/field.php?n=Main.Self-limitingCREAMBehavior"&gt;here&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2782627460372864665?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2782627460372864665/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2782627460372864665' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2782627460372864665'/><link 
rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2782627460372864665'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/02/cream-sours.html' title='cream sours'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_k7p-Ualo6NQ/S3wHhMWMMkI/AAAAAAAAAAM/_mGX5EOCAlk/s72-c/Screen+shot+2010-02-17+at+14.59.38.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-6433385396560740450</id><published>2010-01-29T09:29:00.005Z</published><updated>2010-01-29T13:08:07.364Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='VM'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='UI'/><title type='text'>ScotGrid's shrink wrapped UI</title><content type='html'>In an effort to reduce overhead for new external users who wish to submit to Glasgow I have created a shrink wrapped gLite UI.  This comes in the form of a slimmed down Virtual Box SL5 image with pre-installed gLite UI.&lt;br /&gt;&lt;br /&gt;The hope is that users from external institutions who wish to run jobs on the EGEE grid and more specifically at ScotGrid will be able to take advantage of this.  This is of particular importance for external users of Lumerical's FDTD who are primarily engineers who just want to run the software rather than install an SL5 gLite UI first.  The end goal is extending this to help all our users get up and running as quickly as possible. 
&lt;br /&gt;&lt;br /&gt;This will come pre-installed with Glasgow's submission tools such as &lt;a href="https://www.scotgrid.ac.uk/wiki/index.php/Gqsub_Quickstart_Guide"&gt;gqsub&lt;/a&gt; and other more specific user scripts.  Those wishing a link to download the VM should drop us an email.&lt;br /&gt;&lt;br /&gt;Details of the &lt;a href="https://www.scotgrid.ac.uk/wiki/index.php/ScotGrid_Virtual_Box_VM"&gt;VM image&lt;/a&gt;, &lt;a href="https://www.scotgrid.ac.uk/wiki/index.php/Glite_UI_Install_Instructions"&gt;setting up the UI&lt;/a&gt; are available at the wiki.  &lt;br /&gt;&lt;br /&gt;I am still at a loss how CERN managed to get their VirtualBox image down to 500GB!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-6433385396560740450?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/6433385396560740450/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=6433385396560740450' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6433385396560740450'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6433385396560740450'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/01/scotgrids-shrink-wrapped-ui.html' title='ScotGrid&apos;s shrink wrapped UI'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' 
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3664239111552327482</id><published>2010-01-19T09:48:00.003Z</published><updated>2010-01-20T11:31:51.901Z</updated><title type='text'>pick a torque, any torque</title><content type='html'>Since our seg-faulting mom issue during our SL5 upgrade using 2.3.6 server &amp; mom I have compiled a variety of Torque versions of late and trialled them out. I have now come to some conclusion and am sticking with the 2.3.* series.  2.3.9 at the moment - well until another bug is found!&lt;br /&gt;&lt;br /&gt;2.3 Series &lt;br /&gt;&lt;br /&gt;2.3.6 - seg-faulting mom during some unidentifiable race condition&lt;br /&gt;2.3.7 - untested&lt;br /&gt;2.3.8 - Operators/Managers Lists Bug&lt;br /&gt;2.3.9 - Seems stable&lt;br /&gt;&lt;br /&gt;2.4 Series - Beta&lt;br /&gt;&lt;br /&gt;2.4.2 - OSC MPIEXEC Bug&lt;br /&gt;2.4.3 - OSC MPIEXEC Bug Fixed &amp; Operators/Managers Lists Bug&lt;br /&gt;2.4.4 - OSC MPIEXEC Bug Back in&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3664239111552327482?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3664239111552327482/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3664239111552327482' title='11 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3664239111552327482'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3664239111552327482'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/01/pick-torque-any-torque.html' title='pick a torque, any torque'/><author><name>dug 
mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>11</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7736137194535964183</id><published>2010-01-12T11:51:00.004Z</published><updated>2010-01-12T12:59:00.972Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='WMS'/><title type='text'>Leaks caused by frozen ICE</title><content type='html'>We had a rather quiet time here over winter - a slight hiccough with a disk server, but all rather stable.  Other than that, the big freeze didn't result in much.&lt;br /&gt;&lt;br /&gt;Except for the ice freezing up, and causing leaky pipes.&lt;br /&gt;&lt;br /&gt;That's ICE - the WMS plugin that submits to CREAM.  It turns out that it can break the pipes, and leak bits of past jobs.  This resulted in an error message like:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;Warning - Unable to submit the job to the service: https://svr022.gla.scotgrid.ac.uk:7443/glite_wms_wmproxy_server&lt;br /&gt;System load is too high:&lt;br /&gt;(Not all processes could be identified, non-owned process info will not be shown, you would have to be root to see it all.)&lt;br /&gt;Threshold for ICE Input JobDir jobs: 1500 =&gt; Detected value for ICE Input JobDir jobs /var/glite/ice/jobdir : 1514&lt;br /&gt;&lt;/blockquote&gt;from both WMSen.  In principle, this is reasonable: it's saying that the WMS is loaded up, so no more jobs for the moment.  A decent way of ensuring running jobs are not harmed by new submissions, in the event the system explodes.&lt;br /&gt;&lt;br /&gt;Except the system load was the lowest I've seen on them, under 1.0.  
Dug dug into the underlying Condor instance, which had only a few jobs in it, and the hunt commenced for the 1500 phantom jobs.&lt;br /&gt;&lt;br /&gt;As the error message suggests, /var/glite/ice/jobdir/old had 1514 files in it, each one representing a past job.  However, most of these were old - over a month old.  Given that the WMS is supposed to purge the jobs after a month (if the user doesn't do it earlier), that shouldn't have been the case.&lt;br /&gt;&lt;br /&gt;Derek down at RAL confirmed this - it's apparently a known bug; but I can't quite see it on the &lt;a href="http://glite.web.cern.ch/glite/packages/R3.1/deployment/glite-WMS/glite-WMS-known-issues.asp"&gt;gLite Known issues page&lt;/a&gt;.  It looks like most of the UK's WMS's fell to this at the same time.  I think that's due to the increased number of CREAM CE's (so the rate of use of ICE is climbing), and the fail over on the clients if one WMS is down - resulting in a nice, even distribution of failure.&lt;br /&gt;&lt;br /&gt;In the end, the fix was simple - I moved the files older than a month out of /var/glite/ice/jobdir/old.  Deletion ought to be safe, but they're tiny.  I'll need to automate that, until such time as the bug is fixed - but also need to watch in case the usage increases further, and 1500 isn't enough to last out a month of use.  In that case, I think I'd probably temporarily increase the&lt;br /&gt;limit on the WMS (I believe it's in a configuration file), knowing that most of them are stale phantoms.&lt;br /&gt;&lt;br /&gt;The only discussion I can find related to that error message resulted in pointing the finger at the glite-wms-ice-safe process.  ICE has two processes, and it appears that the ice-safe is the part responsible for cleaning up.  However, as far as I can tell, both processes are running on each of our WMS's, so this appears to be a different case from the previous one.  
It might have been the case that the ice-safe process died, and when it's restarted it's not removing old jobs?  I don't know - if I find out I'll update here.&lt;br /&gt;&lt;br /&gt;The purpose of this post is to get the error message from the WMS into google, and on the same page as something that talks about the issue; and resolution.  In case it freezes up on us again.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7736137194535964183?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7736137194535964183/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7736137194535964183' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7736137194535964183'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7736137194535964183'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2010/01/leaks-caused-by-frozen-ice.html' title='Leaks caused by frozen ICE'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7700444616449473350</id><published>2009-12-14T14:19:00.002Z</published><updated>2009-12-14T14:24:04.491Z</updated><title type='text'>(Almost) 100% Success for Glasgow!</title><content type='html'>&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" 
href="http://1.bp.blogspot.com/_cPmxvuJqJHY/SyZJzZXTLgI/AAAAAAAAACs/8bGMQEXrT6c/s1600-h/IMAG0024.jpg"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 214px;" src="http://1.bp.blogspot.com/_cPmxvuJqJHY/SyZJzZXTLgI/AAAAAAAAACs/8bGMQEXrT6c/s320/IMAG0024.jpg" border="0" alt="" id="BLOGGER_PHOTO_ID_5415096749290368514" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;div&gt;We noticed today that (apart from that pesky red mark for CMS over the past 6 months, and some yellow on the WMS tests) we're looking incredibly green and functional on the Glasgow Dashboard at the moment. So, we took a picture before that changed...&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;(The Glasgow Dashboard is Mike's mashup of all the useful metrics on the web concerning UKI-SCOTGRID-GLASGOW, now over two, alternating, pages. It's actually quite useful, and remarkably festive at this time of year.) &lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7700444616449473350?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7700444616449473350/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7700444616449473350' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7700444616449473350'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7700444616449473350'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/12/almost-100-success-for-glasgow.html' title='(Almost) 100% Success for Glasgow!'/><author><name>Sam 
Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/_cPmxvuJqJHY/SyZJzZXTLgI/AAAAAAAAACs/8bGMQEXrT6c/s72-c/IMAG0024.jpg' height='72' width='72'/><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-2243244095690254641</id><published>2009-12-10T11:22:00.004Z</published><updated>2009-12-10T11:57:37.465Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='CASTEP'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='MPI'/><category scheme='http://www.blogger.com/atom/ns#' term='OPENMPI'/><title type='text'>openmpi magic</title><content type='html'>I have just rebuilt openmpi-1.3.4 for use with CASTEP.  This is built to a useful /opt location with Torque, F90 support for gfortran44.&lt;br /&gt;&lt;br /&gt;gLite support for OPENMPI is fairly generic and means any openmpi rpm install does not have any useful batch or interconnect support.  
So anything out of the ordinary requires a custom build.&lt;br /&gt;&lt;br /&gt;I will stick the RPM available for download &lt;a href="http://ppewww.physics.gla.ac.uk/~dmcnab/index.html"&gt;here&lt;/a&gt; in the next few days.&lt;br /&gt;&lt;br /&gt;The magic for building from a src rpm:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;cd /usr/src/redhat/SPECS/&lt;br /&gt;rpmbuild -ba --define '_prefix /opt/openmpi-1.3.4' --define '_mandir %{_prefix}/share/man' &lt;br /&gt;--define 'configure_options --prefix=/opt/openmpi-1.3.4 --with-tm=/usr/ &lt;br /&gt;FC=gfortran44 F77=gfortran44 CC=gcc44 CXX=g++44 FFLAGS=-O2 FCFLAGS=-O2 CFLAGS=-O2 CXXFLAGS=-O2' &lt;br /&gt;openmpi-1.3.4.spec&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;Full Instructions are &lt;a href="https://www.scotgrid.ac.uk/wiki/index.php/Building_OPENMPI"&gt;here&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2243244095690254641?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2243244095690254641/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2243244095690254641' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2243244095690254641'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2243244095690254641'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/12/openmpi-magic.html' title='openmpi magic'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' 
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1060966028655020376</id><published>2009-12-08T11:03:00.008Z</published><updated>2009-12-08T15:31:07.291Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='CASTEP'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='MPI'/><title type='text'>issues with gfortran43/44 and mpich</title><content type='html'>I am finally getting to the bottom of what has been going wrong with re-compiling MPICH for F90/F95 (required for CASTEP -  a demanding Fortran code).  I have now narrowed it down to one issue between recompiling MPICH with gfortran43/44 for SL5 usage with CASTEP instead of plain old gfortran.&lt;br /&gt;&lt;code&gt;&lt;br /&gt;FC="gfortran44" ; export FC;&lt;br /&gt;F90="gfortran44" ; export F90; &lt;br /&gt;...&lt;br /&gt;--enable-f90modules&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;The SAM MPICH test runs after recompiling with F90 support using gfortran and in fact it works fine on SL4 and SL5.  So that was not the issue.&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;message size            transfertime            bandwidth&lt;br /&gt;32 bytes                0.000000 sec            inf MB/s&lt;br /&gt;2048 bytes              0.000117 sec            17.476267 MB/s&lt;br /&gt;131072 bytes            0.001445 sec            90.687654 MB/s&lt;br /&gt;8388608 bytes           0.078437 sec            106.946397 &lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;It turns out that MPICH just doesn't work when compiled with gfortran43/44.  Leaving me in a bit of an pickle as CASTEP will not compile on SL5 with gfortran, you have to use gfortran43/44!&lt;br /&gt;&lt;br /&gt;Time for the backup plan ..... 
openmpi.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1060966028655020376?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1060966028655020376/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1060966028655020376' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1060966028655020376'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1060966028655020376'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/12/issues-with-gfortran4344-and-mpich.html' title='issues with gfortran43/44 and mpich'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4180498142231712187</id><published>2009-12-03T11:08:00.006Z</published><updated>2009-12-04T11:13:31.961Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='SCAS'/><category scheme='http://www.blogger.com/atom/ns#' term='glexec'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>lightening testing of glexec with SCAS</title><content type='html'>Well since it is looking increasingly lightly that we will be moving to some form of identity switching at our sites to give us more information about who is running their jobs via their pilot frameworks.  
I thought I would give it a whirl.&lt;br /&gt;&lt;br /&gt;So in some lightening tests, a phrase I am stealing from lightening talks sometimes given at technical conferences, I am trialling glexec for identity switching coupled with SCAS for centralised allow/deny decisions.&lt;br /&gt;&lt;br /&gt;Here is what was tested:&lt;br /&gt;&lt;br /&gt;an install of SCAS&lt;br /&gt;and install and test GLEXEC with SCAS on LCG-CE&lt;br /&gt;and install and test GLEXEC with SCAS on CREAM [1]&lt;br /&gt;and install and test GLEXEC on WN (SL4)&lt;br /&gt;and install and test GLEXEC on WN (SL5)&lt;br /&gt;&lt;br /&gt;Detailed Instructions and Results can be found &lt;a href="https://www.scotgrid.ac.uk/wiki/index.php/Glasgow_GLite_gLExec_installation_and_configuration"&gt;here&lt;/a&gt; &lt;br /&gt;&lt;br /&gt;The short and long of it is that it is very easy to set-up SCAS and use it on whatever service you want.  So easy in fact that once your SCAS server is up and running you can direct calls to it from your CE's in a matter of minutes.  glexec on the WN is just as easy, all that remains would be for someone to use it.&lt;br /&gt;&lt;br /&gt;We currently have not rolled any of this into production but I am confident that it could be done quickly and safely.  Since we are into real data taking, safely is the keyword.  We want no unnecessary downtimes, which I think is achievable.&lt;br /&gt;&lt;br /&gt;Thanks to Oscar at Nikhef for answering questions.&lt;br /&gt;&lt;br /&gt;1: there appeared to be a certificate permission issue when calling SCAS from CREAM that prevented job submission. 
It looks like you need to copy the hostcert/key by hand to another cert owned by the tomcat user.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;-rw-r--r--   1 tomcat tomcat   2187 Dec  4 10:44 tomcathostcert.pem&lt;br /&gt;-r--------   1 tomcat tomcat   1863 Dec  4 10:44 tomcathostkey.pem&lt;br /&gt;&lt;/code&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4180498142231712187?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4180498142231712187/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4180498142231712187' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4180498142231712187'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4180498142231712187'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/12/lightening-testing-of-glexec-with-scas.html' title='lightening testing of glexec with SCAS'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-6529349369936000706</id><published>2009-11-27T12:58:00.003Z</published><updated>2009-11-27T13:23:25.400Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='mysql'/><category scheme='http://www.blogger.com/atom/ns#' term='WMS'/><title type='text'>Mysql binary logging revisited</title><content type='html'>After &lt;a href="http://scotgrid.blogspot.com/2009/08/database-backups-and-lock-time.html"&gt;last time,&lt;/a&gt; I'd poked at the LB servers 
databases, so we were getting effectively lock free backups, on one of the servers.&lt;br /&gt;&lt;br /&gt;In the intervening period, after it was seen to be stable, I did the same for the other server.&lt;br /&gt;&lt;br /&gt;However, Mike noted that the disk space used for the logs was growing rapidly.  (I blame those pesky LHC physicists.  Running jobs on our systems - anyone would think there was data to analyse or something ...).  Because we're running with LB servers on the same machines as the WMS, this means that the /var partition contains both the database files, and the users sandbox - hence the old binary logs take space away from users stuff.  (That's something to think about for the reinstall - might be worth separating them).&lt;br /&gt;&lt;br /&gt;Time to automate log trimming.  Firstly, the manual side:  the statement&lt;br /&gt;&lt;br /&gt;  PURGE BINARY LOGS BEFORE '2009-11-01 00:00:00';&lt;br /&gt;&lt;br /&gt;to the server trims out some of the older logs.  &lt;a href="http://dev.mysql.com/doc/refman/4.1/en/purge-binary-logs.html"&gt;You can also&lt;/a&gt; trim up to a given file.&lt;br /&gt;&lt;br /&gt;Better than that, however, is to put&lt;br /&gt;&lt;br /&gt;  expire_logs_days=8&lt;br /&gt;&lt;br /&gt;in the my.cnf.  &lt;a href="http://dev.mysql.com/doc/refman/4.1/en/server-system-variables.html#sysvar_expire_logs_days"&gt;This tells mysql&lt;/a&gt; to retire logs older than 8 days at server start up, or when the logs are flushed.&lt;br /&gt;&lt;br /&gt;So as long as we ensure that when we take a full backup we flush the logs, then logs are automatically trimmed to just over a week's worth.  Adding that parameter to the mysqldump script, and we're done.&lt;br /&gt;&lt;br /&gt;The binary logs have value, independent of the backups - there's a tool to read them, and look at what was happening.  
Whether 8 days is the best level for us is something that we'll have to monitor - arguments for shorter time periods seem stronger than for longer.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-6529349369936000706?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/6529349369936000706/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=6529349369936000706' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6529349369936000706'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6529349369936000706'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/mysql-binary-logging-revisited.html' title='Mysql binary logging revisited'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7213676050663679379</id><published>2009-11-25T10:28:00.003Z</published><updated>2009-11-25T10:40:20.101Z</updated><title type='text'>Torque 2.4.2 to the rescue</title><content type='html'>I previously blogged about our Torque 2.3.6 on SL5 mom's continually &lt;a href="http://scotgrid.blogspot.com/2009/11/segfaulting-pbsmoms.html"&gt;seg-faulting&lt;/a&gt;.  At first we thought it was a bitness issue 32/64 between our SL4 and SL5 mom's running through the same pbs_server.  However, a quick test with the SL4 nodes removed proved that this was not the case.  
A trawl through the source proved unproductive.  &lt;br /&gt;&lt;br /&gt;Therefore, it was time to go to Plan B.  To that end I have built the latest Torque release 2.4.2 and tested this on our pre-prod staging cluster.  This worked well with a configuration of 2.3.6 server and 2.4.2 mom's.  The next test was a test on a single node in production.  This was successful and was running jobs fine when all the other mom's seg-faulted again.  The 2.4.2 mom survived this and continued to run.  So a full roll-out is under way.  We will think about upgrading the server at a later date.  The only point to note is that we have to fully drain a node before doing the upgrade which is a pain.  It does attempt a job conversion but these are unsuccessful as far as we can tell and you end up with dead jobs holding onto job slots.&lt;br /&gt;&lt;br /&gt;So the moral of the story is stay away from 2.3.6 and go to 2.4.2 instead.&lt;br /&gt;It is pretty easy to build but I have hosted our build &lt;a href="http://ppewww.physics.gla.ac.uk/~dmcnab/index.html"&gt;here&lt;/a&gt; for anyone that wants them.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7213676050663679379?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7213676050663679379/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7213676050663679379' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7213676050663679379'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7213676050663679379'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/torque-242-to-rescue.html' title='Torque 2.4.2 to the 
rescue'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-9215446376594359885</id><published>2009-11-24T15:51:00.004Z</published><updated>2009-11-25T10:27:58.293Z</updated><title type='text'>A tale of two job managers</title><content type='html'>A while back I posted about supporting SL4 and SL5 OS's through the same batch system.  Our solution was to use &lt;a href="http://scotgrid.blogspot.com/2009/09/torque-submit-filters.html"&gt;torque submit filters&lt;/a&gt; to add additional node properties to the jobs as they passed through the job managers on the CE's.  This coupled with specific node properties on all nodes on the cluster worked quite well until I noticed that we were leaking CREAM jobs that should have requested SL5 running on the SL4 nodes.  &lt;br /&gt;&lt;br /&gt;After some investigation it appeared that when I was testing the filter and running the cream pbs qsub submit by hand I was always setting the number of nodes, even if I only required 1 i.e.&lt;br /&gt;&lt;pre&gt; &lt;br /&gt;as a pool account .... &lt;br /&gt;/opt/glite/bin/pbs_submit.sh -c /bin/hostname -q q1d -n 1&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;This meant that there was always a &lt;tt&gt;#PBS -l nodes=&amp;lt;someNumber&amp;gt;&lt;/tt&gt; in the submission script.  However, if you call pbs_submit without the &lt;tt&gt;-n&lt;/tt&gt; you get behaviour where no &lt;tt&gt;#PBS -l nodes=&amp;lt;someNumber&amp;gt;&lt;/tt&gt; line appears in the final submission script. This then relies on the default behaviour that if no number of nodes is specified you get 1 node.  This meant that my pbs filter did not catch the number of nodes and did not add the node property at all!&lt;br /&gt;&lt;br /&gt;As it turns out on deeper investigation into the CREAM pbs_submission script. 
 That when 1 node is required it uses the pbs default behaviour and does not specify a number of nodes.  Only when there is more than one does it specify this i.e. MPI.  This is a change from the lcg_CE job manager which always specifies a number of nodes be it 1 or more.  Something to remember.&lt;br /&gt;&lt;br /&gt;To get round this I have added an additional line to the cream pbs submit script to always default to 1 node if not MPI.  Not the best but it's a short lived tweak until we get rid of our SL4 support.  This should be very soon.&lt;br /&gt;&lt;br /&gt;/opt/glite/bin/pbs_submit.sh&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;[ ! -z "$bls_opt_mpinodes" ] || echo "#PBS -l nodes=1" &gt;&gt; $bls_tmp_file&lt;br /&gt;&lt;/pre&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-9215446376594359885?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/9215446376594359885/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=9215446376594359885' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/9215446376594359885'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/9215446376594359885'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/tale-of-two-job-managers.html' title='A tale of two job managers'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' 
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3640759681518761701</id><published>2009-11-19T13:40:00.004Z</published><updated>2009-11-19T13:58:46.875Z</updated><title type='text'>CE Publishing</title><content type='html'>&lt;span style="font-weight:bold;"&gt;The Problem&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;Publishing an inhomogeneous site 'correctly' is not trivial.  This is now required in order to pass the new gstat2 Nagios tests. Things to remember -&lt;br /&gt;&lt;br /&gt;    * Physical is sockets/CPU's and Logical is Cores.&lt;br /&gt;    * Physical * Cores = Logical in order to pass the new central Nagios tests. &lt;br /&gt;&lt;br /&gt;If your cluster is inhomogeneous then you need to be able to publish both clusters separately or as one or come up with a fudged number.  It is made harder as we have one batch system with multiple CE's submitting to it. &lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight:bold;"&gt;Some Solutions&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;    * Sub-Clusters [ what we have implemented at Glasgow ] &lt;br /&gt;    * Publishing decimal for cores &lt;br /&gt;&lt;br /&gt;our implementation is discussed &lt;a href=" https://www.scotgrid.ac.uk/wiki/index.php/Glasgow_Glasgow_GLite_CE_Publishing_Tips"&gt;here.&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Please let me know if anything is wrong with this and I will update.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3640759681518761701?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3640759681518761701/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3640759681518761701' title='0 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/32189452/posts/default/3640759681518761701'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3640759681518761701'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/ce-publishing.html' title='CE Publishing'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-6601959867371529198</id><published>2009-11-19T11:17:00.003Z</published><updated>2009-11-19T11:22:06.582Z</updated><title type='text'>Segfaulting PbsMoms</title><content type='html'>We have an issue with segfaulting mom's that seems correlated with the server trying to ping its moms.  The server version is torque-2.3.6-2cri.x86_64&lt;br /&gt;We are currently supporting two OS's through the same batch system using submit filter and node properties.   Therefore, we have two different versions of moms.&lt;br /&gt;Nodes 1-&gt;295 have moms torque-2.3.6-2cri.x86_64 and 296-&gt;309 have moms torque-2.1.9-4cri.slc4.i386&lt;br /&gt;&lt;br /&gt;When the moms segfault we see that the torque-2.1.9 moms stay up and only the torque-2.3.6 moms all die.  
I ran one of them through GDB and can see the call stack:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;(gdb) where&lt;br /&gt;#0  mom_server_find_by_ip (search_ipaddr=177078032) at mom_server.c:450&lt;br /&gt;#1  0x000000000041965e in mom_server_valid_message_source (stream=0) at mom_server.c:2022&lt;br /&gt;#2  0x0000000000419870 in is_request (stream=0, version=1, cmdp=0x7fffff542ae8) at mom_server.c:2125&lt;br /&gt;#3  0x0000000000416997 in do_rpp (stream=0) at mom_main.c:5351&lt;br /&gt;#4  0x0000000000416a52 in rpp_request (fd=&lt;value optimized out&gt;) at mom_main.c:5408&lt;br /&gt;#5  0x00002ae8ae9f3bc8 in wait_request (waittime=&lt;value optimized out&gt;, SState=0x0) at ../Libnet/net_server.c:469&lt;br /&gt;#6  0x0000000000416c1d in main_loop () at mom_main.c:8046&lt;br /&gt;#7  0x0000000000416ee1 in main (argc=1, argv=0x7fffff5431d8) at mom_main.c:8148&lt;br /&gt;(gdb) run&lt;br /&gt;The program being debugged has been started already.&lt;br /&gt;Start it from the beginning? (y or n) n&lt;br /&gt;Program not restarted.&lt;br /&gt;(gdb) bt full&lt;br /&gt;#0  mom_server_find_by_ip (search_ipaddr=177078032) at mom_server.c:450&lt;br /&gt;       __v = &lt;value optimized out&gt;&lt;br /&gt;       pms = (mom_server *) 0x6cbb80&lt;br /&gt;       addr = &lt;value optimized out&gt;&lt;br /&gt;#1  0x000000000041965e in mom_server_valid_message_source (stream=0) at mom_server.c:2022&lt;br /&gt;       addr = (struct sockaddr_in *) 0x187ef434&lt;br /&gt;       pms = (mom_server *) 0x0&lt;br /&gt;       id = 0x43be08 "mom_server_valid_message_source"&lt;br /&gt;#2  0x0000000000419870 in is_request (stream=0, version=1, cmdp=0x7fffff542ae8) at mom_server.c:2125&lt;br /&gt;       command = &lt;value optimized out&gt;&lt;br /&gt;       ret = 0&lt;br /&gt;       pms = &lt;value optimized out&gt;&lt;br /&gt;       ipaddr = &lt;value optimized out&gt;&lt;br /&gt;       id = "is_request"&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;So it looks like time to dive through the 
source for &lt;code&gt;mom_server_find_by_ip (search_ipaddr=177078032) at mom_server.c:450&lt;/code&gt; or install torque-2.4!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-6601959867371529198?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/6601959867371529198/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=6601959867371529198' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6601959867371529198'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/6601959867371529198'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/segfaulting-pbsmoms.html' title='Segfaulting PbsMoms'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8519147820980714942</id><published>2009-11-17T11:24:00.004Z</published><updated>2011-05-06T13:37:57.136+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='CE'/><category scheme='http://www.blogger.com/atom/ns#' term='arc'/><title type='text'>Arc, authorisation and LCMAPS</title><content type='html'>As a gLite site, it would be ideal if we could have the same user mapping between certificate DN's, and unix user names that is used with our existing CE's.&lt;br /&gt;&lt;br /&gt;Which means using the gLite LCMAPS to make decisions about what username each user has.&lt;br /&gt;&lt;br /&gt;This is supported in Arc, but it's not in the same fashion.&lt;br /&gt;&lt;br 
/&gt;The best approach appears to be:  Have an initial mapping listed in the grid-mapfile (there are utilities to make this easy).  This allows a first pass of authorisation. Then, in the gridFTP server, the mapping rules in there are applied next - this is where LCMAPS applies. &lt;br /&gt;&lt;br /&gt;Interestingly, Arc makes it very easy to do the thing we found hard with LCMAPS - to have a small set of 'local' users with fixed permanent mappings (independent of VO), and VO based pool accounts for other users.&lt;br /&gt;&lt;br /&gt;However, it's in the LCMAPS integration that things get a bit stuck. &lt;br /&gt;&lt;br /&gt;It's a silly 32/64 bitness issue.  On a 64 bit system, yum pulls out the 64bit Arc - as you might expect.  Sadly, there's not a 64 bit version of LCMAPS in the repositories as yet.&lt;br /&gt;&lt;br /&gt;So it's a case of hacking what I need out of etics.  I'll post a recipe when I have one, but this is a pretty temporary situation - it looks like Oscar pretty much has LCAS/LCMAPS ready, but they're not a separate package, so are waiting on the SCAS, CREAM or WMS SL5-64bit packages.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8519147820980714942?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8519147820980714942/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8519147820980714942' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8519147820980714942'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8519147820980714942'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/arc-authorisation-and-lcmaps.html' title='Arc, 
authorisation and LCMAPS'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4806866323205565008</id><published>2009-11-11T12:35:00.002Z</published><updated>2009-11-11T12:41:38.187Z</updated><title type='text'>nmon</title><content type='html'>Seeing sams post about NFS prompted me to mention 'nmon' - its kinda like 'top' on steroids and does particularly useful trend plotting. Originally a hack for AIX but ported to linux once a certain vendor realised people weren't just buying powerpc systems....&lt;br /&gt;&lt;br /&gt;Anyway - go grab from &lt;a href="http://www.ibm.com/developerworks/aix/library/au-analyze_aix/"&gt;http://www.ibm.com/developerworks/aix/library/au-analyze_aix/&lt;/a&gt; - the linux version is now opensource I see.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4806866323205565008?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4806866323205565008/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4806866323205565008' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4806866323205565008'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4806866323205565008'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/nmon.html' title='nmon'/><author><name>Elwell</name><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='21' height='32' src='http://www.physics.gla.ac.uk/~aelwell/mugshot.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3051488271272068937</id><published>2009-11-11T11:22:00.002Z</published><updated>2009-11-11T11:35:18.980Z</updated><title type='text'>NFS Load Tweaks: a Brief Guide for the Interested Enthusiast</title><content type='html'>I was asked about the mystery of NFS server tweaking in a dteam meeting, so I thought I'd compile this brief blog post.&lt;br /&gt;As with all actions, there are two steps: first, gather your information, second, act on this information. &lt;br /&gt;&lt;br /&gt;1) Determining your current NFS load statistics.&lt;br /&gt;&lt;br /&gt;NFS logs useful information in its /proc entry...&lt;br /&gt;&lt;br /&gt;so:&lt;br /&gt;&lt;br /&gt;&gt; cat /proc/net/rpc/nfsd&lt;br /&gt;&lt;br /&gt;rc 0 28905480 1603148913&lt;br /&gt;fh 133 0 0 0 0&lt;br /&gt;io 3663786355 2268252&lt;br /&gt;th 63 362541 16645.121 3156.556 747.974 280.920 148.129 100.155 61.480&lt;br /&gt;42.249 40.829 90.461&lt;br /&gt;ra 256 1069115586 4089582 3055815 2625032 2228952 2114496 1983622&lt;br /&gt;1765372 1743563 1610465 89609536&lt;br /&gt;net 1634942152 0 1634971040 2214677&lt;br /&gt;rpc 1630024431 0 0 0 0&lt;br /&gt;proc2 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0&lt;br /&gt;proc3 22 1573543 1535237104 8743056 1545350887 1532645717 29571823&lt;br /&gt;1179900114 9214599 6691508 538717 366274 0 2801854 39816 505310 4298&lt;br /&gt;2486034 62181794 53164 2414727 0 986878&lt;br /&gt;proc4 2 0 0&lt;br /&gt;&lt;br /&gt;This somewhat arcane looking output is full of variously useful&lt;br /&gt;statistics about your nfs daemon.&lt;br /&gt;&lt;br /&gt;The "rc" (read cache) field gives the fraction of cache hits, misses&lt;br /&gt;and "nocache" (interactions which bypassed the cache) for read&lt;br /&gt;operations.&lt;br /&gt;&lt;br /&gt;The "fh" (file handle) field's most 
important entry is the first - the&lt;br /&gt;number of stale file handles in the system. If you have flaky NFS, for&lt;br /&gt;example, this will be non-zero.&lt;br /&gt;&lt;br /&gt;The io field is simple cumulative io (read, and then written) in bytes.&lt;br /&gt;&lt;br /&gt;The "th" (threads) field is the most interesting field for NFS load&lt;br /&gt;optimisation. The first entry is the total number of threads currently&lt;br /&gt;executing. The second is the number of seconds (?) all threads were in use&lt;br /&gt;(which means your NFS was maxed out in active connections).  The&lt;br /&gt;remaining 10 entries are a histogram of NFS thread utilisation, in&lt;br /&gt;seconds (it seems to be hard to get NFS to reset this; restarting the&lt;br /&gt;daemon definitely doesn't). Plotting this gives you an idea of how&lt;br /&gt;much time your NFS server spends in various load states.&lt;br /&gt;Ideally, you want the last entry (90-100% use) to be comfortably in&lt;br /&gt;the tail of your distribution...&lt;br /&gt;If you have indications that your server spends a lot of its time with&lt;br /&gt;all threads in use, you should increase the maximum number of threads&lt;br /&gt;- powers of 2 are recommended.&lt;br /&gt;&lt;br /&gt;The "ra" (read-ahead cache) field gives similar results, but for the&lt;br /&gt;read-ahead cache. The first number is the size of the cache, the next&lt;br /&gt;10 are a histogram showing how far into the cache entries were found&lt;br /&gt;(so, the first number is the number of times an entry was read from&lt;br /&gt;the first 10% of the cache), and the last is for cache misses.&lt;br /&gt;Obviously, if you're getting a lot of cache misses *and* your cache&lt;br /&gt;hits histogram is heavily right-skewed, it's worth increasing the&lt;br /&gt;cache size. 
(Conversely, if you have a heavily left-skewed histogram,&lt;br /&gt;and few cache misses, you may be able to manage with a smaller cache.)&lt;br /&gt;&lt;br /&gt;The remaining fields are rpc process info fields, which are less&lt;br /&gt;relevant to us for our purposes.&lt;br /&gt;&lt;br /&gt;2. Optimising your NFS.&lt;br /&gt;&lt;br /&gt;The most important things to ensure are that there are enough&lt;br /&gt;resources for the peak load on your NFS service. NFS will spawn new&lt;br /&gt;threads to handle new active connections, and if its max-threads limit&lt;br /&gt;is too low, you'll get brown-outs under high load.&lt;br /&gt;Starting at least four instances of nfsd per processor (and, on modern&lt;br /&gt;processors, up to 8 per core) is recommended as a sensible&lt;br /&gt;configuration. You can set this on the command line for the nfsd&lt;br /&gt;service by simply using the bare number as an option.&lt;br /&gt;&lt;br /&gt;And, of course, if you can bear the risk of data-loss (or silent data&lt;br /&gt;corruption!) on sudden server loss, setting the export option "async"&lt;br /&gt;trivially increases your network throughput by removing the need for&lt;br /&gt;confirmation and syncing of writes between clients and server.&lt;br /&gt;See the NFS config faq at:&lt;br /&gt;&lt;a href="http://nfs.sourceforge.net/#section_b"&gt;http://nfs.sourceforge.net/#section_b&lt;/a&gt;&lt;br /&gt;for more details.&lt;br /&gt;&lt;br /&gt;You may also wish to do the standard setting of packet sizes with&lt;br /&gt;respect to MTU that you would normally do for a network-based&lt;br /&gt;protocol. 
The general process (and some more details) are covered at:&lt;br /&gt;&lt;a href="http://nfs.sourceforge.net/nfs-howto/ar01s05.html"&gt;http://nfs.sourceforge.net/nfs-howto/ar01s05.html&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3051488271272068937?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3051488271272068937/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3051488271272068937' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3051488271272068937'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3051488271272068937'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/nfs-load-tweaks-brief-guide-for.html' title='NFS Load Tweaks: a Brief Guide for the Interested Enthusiast'/><author><name>Sam Skipsey</name><uri>http://www.blogger.com/profile/10165998351125446764</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-188324660800481813</id><published>2009-11-06T13:33:00.003Z</published><updated>2009-11-06T13:52:22.564Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='CE'/><category scheme='http://www.blogger.com/atom/ns#' term='arc'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>Arc, and the installation</title><content type='html'>We've been fiddling with the NorduGrid Arc middleware a bit.  
Not just out of random curiosity, but more trying to get a handle on the workloads that it suits better than gLite, and vice versa.  It does a number of things differently, and by running an Arc CE in parallel with an lcg-CE and CREAM, we can do some solid comparisons.  Oh, and the name of the middleware is also much more amenable to puns, so expect a few groaners too.&lt;br /&gt;&lt;br /&gt;So, consider this the first in a series.  During this process, we expect to end up with a set of notes on how to install and run an Arc setup, for people already familiar with gLite.&lt;br /&gt;&lt;br /&gt;Firstly, install.  We took a blank SL5 box, added the &lt;a href="http://download.nordugrid.org/repos.html"&gt;nordugrid repos&lt;/a&gt;, and then&lt;br /&gt;&lt;br /&gt;  yum groupinstall "ARC Server"&lt;br /&gt;  yum groupinstall "ARC Client"&lt;br /&gt;&lt;br /&gt;Well, very nearly.  There's one more thing needed, which is to add the EPEL dependencies (libVOMS is the key lib)&lt;br /&gt;&lt;br /&gt;  yum install yum-conf-epel&lt;br /&gt;&lt;br /&gt;The next step is to configure it.  That's all done in /etc/arc.conf, and is the subject for later posts.&lt;br /&gt;&lt;br /&gt;There is a need for a filesystem shared between the CE and the worker nodes, so we fired up a spare disk server for NFS.&lt;br /&gt;&lt;br /&gt;Startup is three systems, already configured in /etc/init.d :  gridftp, grid-infosys and grid-manager.&lt;br /&gt;&lt;br /&gt;Ta-da!  A running Arc CE.&lt;br /&gt;&lt;br /&gt;Ok, so there's a fair bit glossed over in the configuration step.  
Next time, I'll talk about how I configured it to work with our existing queues - and where the expectations for Arc differ from gLite.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-188324660800481813?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/188324660800481813/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=188324660800481813' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/188324660800481813'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/188324660800481813'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/11/arc-and-installation.html' title='Arc, and the installation'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4096543540982022493</id><published>2009-10-30T17:55:00.001Z</published><updated>2009-10-30T17:57:23.632Z</updated><title type='text'>worker node on demand</title><content type='html'>Virtualisation is a hot topic again for grid services and worker node on demand&lt;br /&gt;&lt;br /&gt;KVM, XEN, VMWARE - Everyone using different ones.&lt;br /&gt;Virtualisation for cloud - &lt;a href="http://workspace.globus.org/"&gt;Nimbus&lt;/a&gt;, &lt;a href="http://www.opennebula.org/"&gt;Open Nebula&lt;/a&gt;, &lt;a href="http://www.eucalyptus.com/"&gt;eucalyptus&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;the future... 
??&lt;br /&gt;1. plain signed virtual images transported from site to site.&lt;br /&gt;2. virtual images including experiment software.&lt;br /&gt;3. connecting to pilot job frameworks, instantiated with virtual images,&lt;br /&gt;4. pilot frameworks replaced by commercial domain schedulers.  virtual clusters.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4096543540982022493?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4096543540982022493/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4096543540982022493' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4096543540982022493'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4096543540982022493'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/10/worker-node-on-demand.html' title='worker node on demand'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7486402716588094066</id><published>2009-10-26T17:54:00.017Z</published><updated>2009-10-27T03:54:06.329Z</updated><title type='text'>HEPIX is GO!</title><content type='html'>&lt;a href="http://indico.cern.ch/conferenceTimeTable.py?confId=61917"&gt;HEPIX Workshop&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Site Reports Session&lt;br /&gt;&lt;br /&gt;CERN:&lt;br /&gt;Getting serious about ITIL.  Solaris being phased out.  Getting serious about 10GigE.&lt;br /&gt;Lustre pilot project.  
New purchases discussed.&lt;br /&gt;&lt;br /&gt;JLAB:&lt;br /&gt;New LQCD Cluster "2009 Quad Infiniband - ARRA Cluster"&lt;br /&gt;Storage - Whitebox 14 AMAXservers Solaris w/ZFS or Lustre&lt;br /&gt;Compute - DellpowerEdge R4102 x4 Ghz QDR Infiniband, 24Gb RAM&lt;br /&gt;&lt;br /&gt;Auger Cluster Upgraded &lt;br /&gt;Nehalems - intel x5530 dual cpu, quad core, 24MB RAM, 500GB SATA &lt;br /&gt;(seeing i/o contention on disk when running 14/16 jobs)&lt;br /&gt;OS Switch from Fedora 8 32bit, to CentOS 5.3 64bit&lt;br /&gt;&lt;br /&gt;No real Grid Computing&lt;br /&gt;IBM TS3500 tape library installed.  StorageTek Powderhorn silos replaced.&lt;br /&gt;80 production VM's  VMWare ESX3.5 planned to move to vSphere4.0&lt;br /&gt;&lt;br /&gt;GSI: &lt;br /&gt;FAIR - new accelerator discussion.  The futuristic talk!&lt;br /&gt;The Cube DataCentre Building: 1000 19" water cooled racks held in 26x26x26 cube building.  Lifts to reach the machines.  Iron structure for racks to sit on.  &lt;br /&gt;&lt;br /&gt;CC-IN2P3 LYON:&lt;br /&gt;T1 4LHC &amp; D0, Babar, SL5 migration in Q2 2010 for both Main Cluster and MPI Cluster.  New Purchases and New Server Building.&lt;br /&gt;&lt;br /&gt;STORAGE Session&lt;br /&gt;&lt;br /&gt;Your File System NexGen openAFS (Jeffrey Altman):&lt;br /&gt;&lt;a href="http://your-file-system.com/"&gt;YFS&lt;/a&gt; now funded by US Gov to create nextgen openAFS.  2 year funding.  Deliverables included assessment of current AFS and  2 year upgrade plan to client and server for YFS deliverable.  Still open source.&lt;br /&gt;&lt;br /&gt;Storm and Lustre:&lt;br /&gt;IOZONE discussion, Hammer-cloud Tests Discussion, Benchmarking summary, Good Results, performance below iozone tests.  WMS jobs and Panda jobs different.  file:// protocol support performs well but requires the VO to support it.  Open questions:  Lustre Striping (should we stripe - yes or no?).  
Performance (Raid config?), Monitoring - still work to be done, Support - Kernel Upgrades can take a while to be made available and Benchmarks - are they realistic?  Tuning still to do.&lt;br /&gt;&lt;br /&gt;Lustre at GSI:&lt;br /&gt;Users - Alice Analysis for Tier2, GSI Exp, FAIR Simulations.  Still on 1.6.7.2.  1PByte, &gt; 3000 nodes.  Foundry RX32 ethernet switch.   MDS HA Pair, one standby. 84 OSS, 200 OSTs.  MDS 8 core, 3GHz Xeon, 32GB RAM.  Real throughput testing with Alice Analysis Train. 50Gbit/s using 2000 cores.  Hardware and Software issues.  Complex system and vulnerable to network communications.  Using &lt;a href="http://robinhood.sourceforge.net/"&gt;Robin Hood Filesystem Monitor&lt;/a&gt; for audit and management.  This protects the MDS by directing requests (e.g. top ten users, file moves etc.) to a MySQL instance.  Using this rather than e2Scan.&lt;br /&gt;&lt;br /&gt;Hadoop on your worker nodes using local hard drives &amp; Fuse:&lt;br /&gt;Hadoop compared against Lustre.  Performed well when 8 jobs ran.  Replication of files provides redundancy. Cost and maintenance factor very favourable to small sites. Deployed in some sites in the US.  Not really a Tier 1 deployable solution.  Name node redundancy exists (will lose at most one transaction) - requires additional software.&lt;br /&gt;&lt;br /&gt;Virtualization Session&lt;br /&gt;&lt;br /&gt;lxcloud at CERN: &lt;br /&gt;Cern has developed a proof of concept for virtualized worker nodes.  'Golden nodes' serving images to the Xen Hypervisors using Open Nebula. Also looked at Platform's VMO.  Production lxcloud being built.  10 machines, 24GB, 2TB disk dual Nehalem.  Starting with Xen.  Production release by March 2010.    Memory an issue as the HyperVisor requires some memory i.e. with 16GB RAM you cannot run 8 2GB VM's.  &lt;br /&gt;&lt;br /&gt;Fermigrid: &lt;br /&gt;Has moved much of its infrastructure to Xen HyperVisor.  Looks like a solid infrastructure. 
Investigating KVM with the possibility of a move in the next few years if it proves to be better.   INFN mentioned &lt;a href="http://indico.cern.ch/contributionDisplay.py?contribId=5&amp;sessionId=7&amp;confId=45282"&gt;Xen vs KVM&lt;/a&gt; at Hepix Spring 2009 for discussion of differences.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7486402716588094066?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7486402716588094066/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7486402716588094066' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7486402716588094066'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7486402716588094066'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/10/hepix-is-go.html' title='HEPIX is GO!'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-7428680661053956684</id><published>2009-10-19T16:52:00.002+01:00</published><updated>2009-10-19T16:59:55.073+01:00</updated><title type='text'>Another new VO at Glasgow</title><content type='html'>Today I finally got time to create a new VO for our new users in Solid State Physics. 
&lt;pre&gt;vo.ssp.ac.uk&lt;/pre&gt;  This is now active across the cluster and users can sign up to the VO from our voms server on &lt;a href="https://svr029.gla.scotgrid.ac.uk:8443/voms/vo.ssp.ac.uk/StartRegistration.do"&gt;svr029&lt;/a&gt; and will be used to host users of &lt;a href="http://www.castep.org/"&gt;CASTEP&lt;/a&gt; and other departmental SSP users.&lt;br /&gt;&lt;br /&gt;Our local wiki page on running &lt;a href="http://www.castep.org/"&gt;CASTEP&lt;/a&gt; at &lt;a href="https://www.scotgrid.ac.uk/wiki/index.php/Using_CASTEP_Package"&gt;Glasgow&lt;/a&gt;. Only the MPI version to get working now.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-7428680661053956684?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/7428680661053956684/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=7428680661053956684' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7428680661053956684'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/7428680661053956684'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/10/another-new-vo-at-glasgow.html' title='Another new VO at Glasgow'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3135397235713448751</id><published>2009-10-12T17:08:00.005+01:00</published><updated>2009-10-19T16:52:21.163+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' 
term='SSP'/><category scheme='http://www.blogger.com/atom/ns#' term='SL5'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='MPI'/><title type='text'>CASTEP, A Test of True Grid</title><content type='html'>Along came another user with a requirement for MPI.  Can we run it?  Well yes you can but remember our interconnects are just plain old Ethernet and nothing fancy like Myrinet or Infiniband.  We are not an HPC cluster but an HTC cluster.&lt;br /&gt;&lt;br /&gt;So we have been building CASTEP, an f90 code, heavy on the MPI scatter/gather.  A test of true grid for any HTC cluster.  First off CASTEP requires a minimum of make3.81 and gfortran43.  Handy that we moved to SL5 as these are now the standard. Coupled with making sure that the required libs fftw3, blas and lapack are all built with the same compiler, gfortran43, this allowed the single core version to be built and installed onto the grid. &lt;br /&gt;&lt;br /&gt;An MPI version is turning out to be a bit more work.  First off the old, outdated and no longer developed libs MPICH have not been built with .f90 support enabled by default. So we have got hold of the source to do a recompile with .f90 support on for gfortran43.  There also appeared to be a bug in the gfortran support.  So we had to patch the src rpm to include a patch that we located online.  This allowed us to finally build the mpich lib.  
This has been tested with compilation of an MPI job in c and f90, both of which run successfully.&lt;br /&gt;&lt;br /&gt;Unfortunately CASTEP still doesn't run using it so more digging required.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3135397235713448751?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3135397235713448751/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3135397235713448751' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3135397235713448751'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3135397235713448751'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/10/castep-test-of-true-grid.html' title='CASTEP, A Test of True Grid'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8852679117552582903</id><published>2009-09-24T13:36:00.005+01:00</published><updated>2009-09-24T14:01:56.721+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='gqsub'/><title type='text'>gqsub at EGEE09</title><content type='html'>Just a short note from the EGEE 09 conference.  
It's been very gratifying to have had so much interest in gqsub at the conference - I even had emails about it scant hours after the poster was put up (and before the official poster session!).&lt;br /&gt;&lt;br /&gt;In response to the comments received, I've put a roadmap of planned features up on the &lt;a href="http://www.scotgrid.ac.uk/gqsub/"&gt;gqsub page&lt;/a&gt;, which gives an idea of where it's headed.&lt;br /&gt;&lt;br /&gt;In addition, v 1.2.0 is out, which implements auto staging back of output.  This means that in cases where there is &lt;span style="font-style: italic;"&gt;not&lt;/span&gt; a shared filesystem between the UI and the worker node, but there &lt;span style="font-style: italic;"&gt;is&lt;/span&gt; a GridFTP server on the UI, then gqsub will pull out the JDL tricks we used earlier with the Lumerical deployment.  This results in the illusion of a shared filesystem - the job is submitted, and the output appears in the right places as if it was done in a shared filesystem.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8852679117552582903?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8852679117552582903/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8852679117552582903' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8852679117552582903'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8852679117552582903'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/09/gqsub-at-egee09.html' title='gqsub at EGEE09'/><author><name>Stuart 
Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-8282871547481435933</id><published>2009-09-23T11:34:00.010+01:00</published><updated>2009-09-30T16:59:50.903+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='torque'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>torque submit filters</title><content type='html'>After debating whether to add node properties for SL4 and SL5 into the job managers for both cream and the lcg-ce I read Derek's post from RAL about using submit filters.  So I thought I would have a go and see if I could tweak the node specification, keep the number of nodes requested intact for MPI and add an additional property for the particular CE.  Turns out it's easy to implement but as usual there is some weirdness.   
You should be able to write your filter in whatever language you like and just specify it in torque.cfg, i.e.&lt;br /&gt;&lt;br /&gt;Here is a simple example in bash:&lt;br /&gt;/usr/local/sbin# cat torque_submit_filter.sh&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;#!/bin/sh&lt;br /&gt;while read i&lt;br /&gt;  do&lt;br /&gt;    if [[ $i =~ "^#PBS -l nodes=[0-9]" ]]&lt;br /&gt;    then&lt;br /&gt;      export i="${i}:SL4"&lt;br /&gt;    fi&lt;br /&gt;    echo $i&lt;br /&gt;  done&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;/var/spool/pbs# cat torque.cfg&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;SUBMITFILTER /usr/local/sbin/torque_submit_filter.sh&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;This works with cream but not with the lcg-ce.&lt;br /&gt;&lt;br /&gt;So let's try again but this time in perl:&lt;br /&gt;&lt;br /&gt;/usr/local/sbin# cat torque_submit_filter.pl&lt;br /&gt;&lt;pre&gt;&lt;br /&gt;#!/usr/bin/perl -w&lt;br /&gt;&lt;br /&gt;use strict;&lt;br /&gt;&lt;br /&gt;# Echo all other input&lt;br /&gt;while (&amp;lt;STDIN&amp;gt;)&lt;br /&gt;  {&lt;br /&gt;    # By default just copy the line.&lt;br /&gt;    my $line = $_;&lt;br /&gt;&lt;br /&gt;    if ($line =~ m/^#PBS -l nodes=[0-9]/)&lt;br /&gt;      {&lt;br /&gt;        chomp($line);&lt;br /&gt;        $line = $line . ":SL5\n";&lt;br /&gt;      }&lt;br /&gt;&lt;br /&gt;    print ($line);&lt;br /&gt;  }&lt;br /&gt;&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;Now this works with both cream and lcg-ce!  Obviously you can do whatever takes your fancy to the qsub input and make it more intelligent.&lt;br /&gt;&lt;br /&gt;A word of warning.  We used the same queues for both CE's which meant that SL4 and SL5 resources were indistinguishable to users unless they used OS specific CE requirements.  We ended up flooded on the SL4 queues, with lots of free slots on the SL5 queues.  So in the end we have created a new set of queues for the SL4 CE.  
Hopefully this will be explicit enough for users to target the correct CE.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-8282871547481435933?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/8282871547481435933/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=8282871547481435933' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8282871547481435933'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/8282871547481435933'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/09/torque-submit-filters.html' title='torque submit filters'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4578767439198947376</id><published>2009-09-22T12:01:00.005+01:00</published><updated>2009-09-23T16:26:24.272+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='YAIM'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='Upgrade'/><title type='text'>Fun with 5!</title><content type='html'>So, the SL5 migration was done last week, and as promised at yesterday's dteam meeting I am posting our problems so that other sites can watch out for similar issues (although some of these are deeply related to the way we do things at Glasgow).&lt;br /&gt;&lt;br /&gt;First though, the successes:&lt;br /&gt;&lt;ol&gt;&lt;li&gt;1800 cores running 
SL5&lt;/li&gt;&lt;li&gt;DPM headnode upgraded to SL5&lt;/li&gt;&lt;li&gt;Torque server upgraded to SL5, running torque 2.3.6, maui 3.2.6p21&lt;/li&gt;&lt;li&gt;&lt;span style="font-family: courier new;"&gt;/atlas/uk&lt;/span&gt; voms group supported with a separate fairshare&lt;br /&gt;&lt;/li&gt;&lt;/ol&gt;Now, the list of problems:&lt;br /&gt;&lt;br /&gt;1. We introduced a new python script, &lt;span style="font-family:courier new;"&gt;gridAccounts.py&lt;/span&gt;, to generate pool accounts, retiring the venerable, but incomprehensible, &lt;span style="font-family:courier new;"&gt;genaccts.pl&lt;/span&gt; script we had before (Andy Elwell wrote that and his comment was "OK - I give up with python as I need this NOW..."; my retort was "I HATE PERL SO MUCH. IT'S A SHIT LANGUAGE.", but I had never found the time to rewite it until now). The new script reads standard config files, so it's a lot easier to manage, understand and extend. However, all change is (a bit) dangerous and the new script initially had groups in the wrong order in yaim's user.conf, which caused the groupmapfile to be wrong. This then caused all jobs to fail - the uid/gid of the gridftp session did not match the uid/primary gid of the user and gridftp does not like that at all.&lt;br /&gt;&lt;br /&gt;(The reason we have to write users.conf is because we still get yaim to do a lot, although we manage all accounts through cfengine; yaim relies on this file to configure various other aspects of the system, such as grid/group mapfiles.)&lt;br /&gt;&lt;br /&gt;2. We were trying to support the &lt;span style="font-family:courier new;"&gt;/atlas/uk&lt;/span&gt; VOMS group as a separate entity. 
This is simple in theory (!), you're looking for the following entries in voms-grid-mapfile:&lt;br /&gt;&lt;br /&gt;"/atlas/uk/Role=NULL/Capability=NULL" .ukatlas&lt;br /&gt;"/atlas/uk" .ukatlas&lt;br /&gt;&lt;br /&gt;and this in groupmapfile:&lt;br /&gt;&lt;br /&gt;"/atlas/uk/Role=NULL/Capability=NULL" atlasuk&lt;br /&gt;"/atlas/uk" atlasuk&lt;br /&gt;&lt;br /&gt;If we were managing these files directly, it would have been no problem. However, convincing YAIM to do this was far from easy. This is not helped by the fact that YAIM is now utterly incomprehensible in many ways (have a look at yaim/utils/users_getvogroup if you don't believe me). Finally we hit on the correct recipe, which is to have these accounts in users.conf, with a new "special" defined:&lt;br /&gt;&lt;br /&gt;201601:ukatlas001:201040,201000:atlasuk,atlas:atlas:uk:&lt;br /&gt;201602:ukatlas002:201040,201000:atlasuk,atlas:atlas:uk:&lt;br /&gt;201603:ukatlas003:201040,201000:atlasuk,atlas:atlas:uk:&lt;br /&gt;...&lt;br /&gt;&lt;br /&gt;with this line added to groups.conf:&lt;br /&gt;&lt;br /&gt;"/VO=atlas/GROUP=/atlas/uk":::uk:&lt;br /&gt;&lt;br /&gt;Aside: Sometimes I wonder if YAIM has outgrown its usefulness. From something we could understand and tweak easily it's now a sed|awk|cut|sort|tail black box monster, which uses a computerised format for configuration files. c.f. the configuration we have for our own scripts:&lt;br /&gt;&lt;br /&gt;[someuser]&lt;br /&gt;dn = /C=UK/O=eScience/OU=Glasgow/L=Compserv/CN=some user&lt;br /&gt;uid = 4832&lt;br /&gt;home = /clusterhome/home/someuser&lt;br /&gt;group = atlas&lt;br /&gt;tier25 = True&lt;br /&gt;vo = gla&lt;br /&gt;&lt;br /&gt;And trying to do grid configuration manipulations in a language which doesn't have dictionaries is just ridiculous.&lt;br /&gt;&lt;br /&gt;Maybe we'll need to wean ourselves off it eventually?&lt;br /&gt;&lt;br /&gt;3. Information publishing on svr018 was broken after the upgrade. 
There was a cryptic reference to a required 'dpminfo' user which Sam had made in his notes. Adding this user did seem to make things work, though it's not at all clear why. Hopefully Sam will enlighten us later. In passing, note that the resource BDII on the service nodes seems to be 'protected' now, so attempts to reach it from 'outside' fail. This is new behaviour and lost us some time in debugging.&lt;br /&gt;&lt;br /&gt;4. Terrible trouble was caused by upgrading the torque server to SL5. Using the SteveT build of torque server (http://skoji.cern.ch/sa1/centos5-torque/) seemed to cause grave problems with moms crashing on the worker nodes. Downgrading the moms to torque 2.3.0 didn't work as the jobs files (/var/spool/pbs/mom_priv/jobs) seemed to be in incompatible formats and led to crashing moms plus a very confused torque server. Cleaning out all jobs seemed to not work either. The final solution was to rebuild torque 2.3.6 on SL5 - this gave a consistent and compatible server/mom pairing.&lt;br /&gt;&lt;br /&gt;A small side effect though, was that the rebuilt maui had a different 'secret' in it, so I have had to hack the info provider on the SL4 CEs to use the --keyfile= argument in the maui client commands. (That's such a stupid 'feature'.)&lt;br /&gt;&lt;br /&gt;5. Once we were out of downtime, random transfers to the DPM were failing. Eventually we tracked this to the reduction in the number of pool accounts for atlasprd. There was no sync between the passwd file and the /etc/grid-security/gridmapdir pool account list, of course, so gridftp was throwing a "530 Login incorrect. : No local mapping". We realised that&lt;br /&gt;&lt;ol&gt;&lt;ol&gt;&lt;li&gt;/etc/passwd should be handled better on nodes which need to map pool accounts.&lt;/li&gt;&lt;li&gt;For the moment never reduce the number of accounts!&lt;/li&gt;&lt;li&gt;N.B. 
on the CEs the gridmapdir is shared, so maintenance probably needs to be delegated&lt;/li&gt;&lt;li&gt;If we remove a pool account mapping then you have to remove the link from any DNs to this mapping as well (look for DN filenames with only 1 hard link).&lt;br /&gt;&lt;/li&gt;&lt;/ol&gt;&lt;/ol&gt;OK, that's it. We got there, though not without some anxious moments!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4578767439198947376?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4578767439198947376/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4578767439198947376' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4578767439198947376'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4578767439198947376'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/09/fun-with-5.html' title='Fun with 5!'/><author><name>Graeme Stewart</name><uri>http://www.blogger.com/profile/04113191724360870254</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='21' height='32' src='http://www.physics.gla.ac.uk/~graeme/graeme.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3365958783755074456</id><published>2009-09-22T11:28:00.004+01:00</published><updated>2009-09-22T11:50:32.992+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='WMS'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>wms myproxy renewal wobbles</title><content type='html'>During our recent reconfiguration to SL5 we also re-wrote 
our user account generation script from perl to python.  Well Graeme did actually.  So now it's very easy to understand and extend.   A consequence of this was that we created a new &amp;lt;vo&amp;gt; directory in /home for each user to keep things neat and tidy.  This necessitated the recreation of all home directories across the cluster.  A task fraught with danger.  &lt;br /&gt;&lt;br /&gt;However, we managed it except that I blew away the glite user from the WMS in the process and the .certs and .globus certificates required to run the WMS.  After replacing them everything worked fine or so I thought.  Recently we received reports that the myproxy renewal was not working and as it transpired the /home/glite/.certs/hostkey.pem and /home/glite/.certs/hostcert.pem must be owned by the &lt;span style="font-weight:bold;"&gt;glite&lt;/span&gt; user and &lt;span style="font-weight:bold;"&gt;not&lt;/span&gt; root for the renewal process to work!  One to watch!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3365958783755074456?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3365958783755074456/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3365958783755074456' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3365958783755074456'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3365958783755074456'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/09/wms-myproxy-renewal-wobbles.html' title='wms myproxy renewal wobbles'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' 
height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3587117015889696993</id><published>2009-09-18T14:38:00.008+01:00</published><updated>2009-09-22T12:27:16.060+01:00</updated><title type='text'>SL5 migration and CPU deployment</title><content type='html'>As of last week have now migrated ScotGrid-Glasgow to SL5.  This meant worker nodes, DPM's and batch system all becoming SL5 in one big flurry of activity.  We started on Monday morning and came out of downtime on Wednesday evening with SAM tests passing.  Since then we have been mopping up the remaining issues that cropped up along the way but more on that later.  &lt;br /&gt;&lt;br /&gt;So as of 16th September 1800 jobs slots running SL5 out of a total of 1912.  The remaining 112 job slots have been held back as SL4 till December to allow those VO's with unpatched software kits or that are simply not ready to move to SL5 to run jobs.&lt;br /&gt;&lt;br /&gt;Similar to RAL and other sites we have gone with separate CE's between SL4 and SL5 to allow for those VO's that cannot co-exist on the same CE.  These CE's will very shortly be submitting to the same batch system using node requirements :SL4, :SL5 set from the CE as described by &lt;a href="http://southgrid.blogspot.com/2007/10/sl4-worker-node-migration-at-ralpp.html"&gt;SouthGrid&lt;/a&gt; from their SL3 to SL4 migration.  It does necessitate from job manager tweaking but it works.  
I may try and switch this to submit filter when I get the chance as job manager tweaking is never very robust.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3587117015889696993?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3587117015889696993/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3587117015889696993' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3587117015889696993'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3587117015889696993'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/09/sl5-migration-and-cpu-deployment.html' title='SL5 migration and CPU deployment'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3262984592853387459</id><published>2009-09-09T11:58:00.004+01:00</published><updated>2009-09-24T13:36:28.735+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='gqsub'/><title type='text'>Canna I no just use qsub?</title><content type='html'>Ah, the endless refrain.&lt;br /&gt;&lt;br /&gt;Anytime a user with cluster experience is introduced to the gLite submission mechanism, some question of that order (although not always with a Scottish accent) is inevitable.&lt;br /&gt;&lt;br /&gt;Pulling out my Human-Computer Interaction hat, I first came to the conclusion that, despite the occasional hints to the contrary, users are indeed Human.  
Hot on the heels of this realisation, a little bit of analysis of the gLite job submission and control tools indicated that, whilst very powerful, they work in a very different fashion to qsub.&lt;br /&gt;&lt;br /&gt;It's not clear that qsub is in any sense a &lt;span style="font-style: italic;"&gt;better &lt;/span&gt;interface than the native command line tools, but it is clear that it is &lt;span style="font-style: italic;"&gt;different&lt;/span&gt;.&lt;br /&gt;&lt;br /&gt;The general idea was to resolve this difference by providing a different interface to grid job submission that was more familiar to users with existing experience of cluster computing.  Whether it's going to be a better approach for a user without that experience is not clear; but it will make it simpler for users to use the Grid as an offload for a local cluster (i.e. use a cluster, when it's full, send the jobs to the Grid).&lt;br /&gt;&lt;br /&gt;It turns out that the POSIX definition of qsub isn't too far away, conceptually, from a Grid system, so all that was needed to act as an interface translation layer was a relatively straightforward python script.&lt;br /&gt;&lt;br /&gt;Rather than relay all the gory details here, let me direct you to the &lt;a href="http://www.scotgrid.ac.uk/gqsub/"&gt;gqsub&lt;/a&gt; download page, with the manual.&lt;br /&gt;&lt;br /&gt;For users on svr020, it's installed in the default path, so you can just use it.  
Note that to properly mirror the expected behaviour you probably want to make sure you run from within $CLUSTER_SHARED.&lt;br /&gt;&lt;br /&gt;But to answer the original question: &lt;a href="http://www.scotgrid.ac.uk/gqsub/"&gt;"Aye!"&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3262984592853387459?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3262984592853387459/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3262984592853387459' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3262984592853387459'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3262984592853387459'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/09/canna-i-no-just-use-qsub.html' title='Canna I no just use qsub?'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-4937795432748936553</id><published>2009-09-02T09:25:00.005+01:00</published><updated>2009-09-02T10:30:39.844+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='job wrapper'/><category scheme='http://www.blogger.com/atom/ns#' term='WMS'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>who changed the job wrapper?</title><content type='html'>It was a long night yesterday as Graeme and I tried to fix our failing ops CE tests.  
It started on Monday night when SAM mysteriously started failing across all CE's at Glasgow and then Durham.  The jobs appeared to run but just stayed at the running state until the WMS presumably killed them and eventually failed ops tests.&lt;br /&gt;&lt;br /&gt;After investigation we noticed the 'cannot download .BrokerInfo from' error.  A quick look on a node proved that it was owned in /tmp by another user rather than ops.  A strace -f -p NNNN on the globus-url-copy command process showed the ops job was getting a permission denied when trying to create/copy the file.  A look at past CE-sft-broker tests showed the a very clear difference, in fact there was a missing directory!&lt;br /&gt;&lt;code&gt;&lt;br /&gt;-rw-r--r--  1 sgmops001 opssgm 3085 Aug 31 05:06 /tmp/https_3a_2f_2fwms208.cern.ch_3a9000_2fElSbIsNqd8SN69eCXPN1JA/.BrokerInfo&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;-rw-r--r--  1 sgmops001 opssgm 2312 Sep  1 22:34 /tmp/.BrokerInfo&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;Removing this file allowed the ops test to run but why it was happening was still a mystery.  A work around we have deployed is to create an additional directory in cp_1.sh i.e.&lt;br /&gt;&lt;code&gt;&lt;br /&gt;# Workaround for gLite WMS jobs, which don't cd into EDG_WL_SCRATCH...&lt;br /&gt;echo In cp_1.sh&lt;br /&gt;echo Making temporary work directory&lt;br /&gt;templ=$TMPDIR/glite_run_XXXXXXXX&lt;br /&gt;temp=$(mktemp -d $templ)&lt;br /&gt;echo Changing work directory to $temp&lt;br /&gt;cd $temp&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;In the end we had to remove every blocking .BrokerInfo file from /tmp across the cluster and ops jobs started passing again.  Further digging showed that the job wrapper has changed somewhere along the line.  
The old job wrapper had code like this in it.&lt;br /&gt;&lt;code&gt;&lt;br /&gt;#if [ ${__job_type} -eq 0 -o ${__job_type} -eq 3 ]; then # normal or interactive&lt;br /&gt;  newdir="${__jobid_to_filename}"&lt;br /&gt;  mkdir ${newdir}&lt;br /&gt;  cd ${newdir}&lt;br /&gt;#elif [ ${__job_type} -eq 1 -o ${__job_type} -eq 2 ]; then # MPI (LSF or PBS)&lt;br /&gt;#fi&lt;br /&gt;&lt;/code&gt;  &lt;br /&gt;This has now been &lt;span style="font-weight:bold;"&gt;removed&lt;/span&gt; and could be causing issues for other sites.  Torque and SGE have functionality to ring-fence every job perhaps we would have been safer using it but running jobs from /tmp worked for 3 years.  Not any more it would seem.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-4937795432748936553?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/4937795432748936553/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=4937795432748936553' title='5 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4937795432748936553'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/4937795432748936553'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/09/who-changed-job-wrapper.html' title='who changed the job wrapper?'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' 
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>5</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5260753523362657223</id><published>2009-08-28T16:53:00.003+01:00</published><updated>2009-08-28T17:02:59.911+01:00</updated><title type='text'>multiple WMS yaim problems</title><content type='html'>I was alerted today by our all new shiny jabber chatroom that we were publishing the same WMS via lcg-infosites. A quick check and there it was....&lt;br /&gt;&lt;code&gt;&lt;br /&gt;-bash-3.00$ lcg-infosites --vo camont wms&lt;br /&gt;https://svr022.gla.scotgrid.ac.uk:7443/glite_wms_wmproxy_server&lt;br /&gt;https://svr022.gla.scotgrid.ac.uk:7443/glite_wms_wmproxy_server&lt;br /&gt;https://lcgwms03.gridpp.rl.ac.uk:7443/glite_wms_wmproxy_server&lt;br /&gt;https://wms00.hep.ph.ic.ac.uk:7443/glite_wms_wmproxy_server&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;We recently moved to the latest WMS 3.1 release last week and I thought it may have been down to that.  Upon further inspection I found the following GIP plugin:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;svr023:/opt/glite/etc/gip/provider# ./glite-info-provider-service-wmproxy-wrapper&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;which was publishing the wrong WMS.&lt;br /&gt;&lt;code&gt;&lt;br /&gt;GlueServiceEndpoint: https://svr022.gla.scotgrid.ac.uk:7443/glite_wms_wmproxy_server&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;In the file the problem was an obvious one:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;export WMPROXY_HOST=svr022.gla.scotgrid.ac.uk svr023.gla.scotgrid.ac.uk&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;This begs the question: can YAIM deal with more than one WMS and, if so, how do you specify them?  We had always gone for a quoted, space separated list in site-info.def &lt;br /&gt;i.e.&lt;br /&gt;&lt;code&gt;&lt;br /&gt;WMS_HOST="svr022.$MY_DOMAIN svr023.$MY_DOMAIN"&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;but perhaps you can't do that any more and you need to override the WMS_HOST in a node specific way.  
Oh well.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-5260753523362657223?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5260753523362657223/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5260753523362657223' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5260753523362657223'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5260753523362657223'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/08/multiple-wms-yaim-problems.html' title='multiple WMS yaim problems'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1776062493639839439</id><published>2009-08-13T11:41:00.003+01:00</published><updated>2009-11-27T12:58:30.138Z</updated><category scheme='http://www.blogger.com/atom/ns#' term='mysql'/><title type='text'>Database backups, and lock time</title><content type='html'>Running a service creates data.  Running a service for a long time creates lots of data.&lt;br /&gt;&lt;br /&gt;In this case, the WMS and LB servers - we're sitting with about 18GB on each LB.  
This is not a problem - they're well indexed against the usual queries (out of the box, &lt;a href="http://scotgrid.blogspot.com/2009/03/indices-and-dpms-story-of-optimisation.html"&gt;no fiddling required&lt;/a&gt;), so the old data isn't really noticed.&lt;br /&gt;&lt;br /&gt;Until you take a backup.&lt;br /&gt;&lt;br /&gt;Then, in order to get a consistent backup, it's locked for however long it takes to dump all that data.  Which is about 45 minutes.&lt;br /&gt;&lt;br /&gt;That's too long - it means we have some time when it's not available, it's getting noticed.  So, how can we take a backup, without locking the database for so long?&lt;br /&gt;&lt;br /&gt;There are various options for that, but the best looking (read: simplest) one is to enable binary logging in MySQL.  Because the tables used are all InnoDB, which is transactional, this means that the backup can mark a position in the log, and then use that to _not_ backup operations that came after it - which results in a consistent backup.  (If you're using any MyISAM tables, which are not transactional, you can't do this.  Hence the use of LVM snapshotting or other exotic techniques).&lt;br /&gt;&lt;br /&gt;This is really simple:  in the my.cnf for each service, put 'log-bin' (without the quotes) in the [mysqld] section, and restart.&lt;br /&gt;&lt;br /&gt;Binary logging is now enabled.&lt;br /&gt;&lt;br /&gt;Next, to take a lock free [0] dump, add the --single-transaction flag to mysqldump.&lt;br /&gt;&lt;br /&gt;The time taken to actually dump the data to disk won't change, but the database won't be locked for that time.&lt;br /&gt;&lt;br /&gt;I did this for one of our LB servers, and then, while the dump was running, submitted a job through the WMS.  
The job was assigned to the LB I was dumping, proving it can be written to, and has now completed, while the dump hasn't yet finished.&lt;br /&gt;&lt;br /&gt;I've modified our usual backup script, so that if it detects the presence of /var/lib/mysql/${hostname}-bin.index, which is the index for the binary log, it automatically uses --single-transaction.  That way, we still have a single backup script, but it does it the best way possible.&lt;br /&gt;&lt;br /&gt;There are a couple of downsides to binary logging:  It means the DB has to write more data to disk, so is about 1% slower.  As the services are not running at 99% of the cpu, that's ok for us.  It also means that each new piece of data is stored twice - once in the DB, and once in the log.  Therefore the data storage need grows twice as fast  - faster, if there are deletes to the database.  I'm looking at an 18GB database - so this won't be a problem.  Also, you can purge old logs, so I don't feel that this is  a problem any more than the risk of the database expanding over the partition size is.&lt;br /&gt;&lt;br /&gt;One thing I'll be looking at is using the binary logs to take an incremental backup.  That'll still not lock the database, but will also be much smaller and faster to take.  That's a bit more complicated to arrange, so it'll go into the pile of 'ideas that look nice, but we don't think we need it yet'.&lt;br /&gt;&lt;br /&gt;As an aside, I think this has to go down as one of the more anticlimactic updates - it was simple, quick and just worked.  Unless disk space is very tight, I can't see why one wouldn't enable it.&lt;br /&gt;&lt;br /&gt;[0] Technically, it takes a lock, waits for all pending transactions to complete, marks the log position, then releases it.  
If you have slow operations in flight, it locks it for the duration of that operation.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1776062493639839439?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1776062493639839439/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1776062493639839439' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1776062493639839439'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1776062493639839439'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/08/database-backups-and-lock-time.html' title='Database backups, and lock time'/><author><name>Stuart Purdie</name><uri>http://www.blogger.com/profile/08473287949581285669</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-859559678834687265</id><published>2009-08-12T10:33:00.006+01:00</published><updated>2009-08-12T12:42:40.889+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='NGS'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><category scheme='http://www.blogger.com/atom/ns#' term='VO'/><title type='text'>getting ngs.ac.uk voms to work</title><content type='html'>I have been looking into an issue with the NGS as they are testing submission to the WMS.  
A ticket was raised as authentication failed on both our production CEs.&lt;br /&gt;&lt;br /&gt;This was recreated by creating an ngs voms proxy.&lt;br /&gt;&lt;code&gt;&lt;br /&gt;-bash-3.00$ voms-proxy-init -voms ngs.ac.uk --valid 240:00&lt;br /&gt;Cannot find file or dir: /clusterhome/home/gla057/.glite/vomses&lt;br /&gt;Enter GRID pass phrase:&lt;br /&gt;Your identity: /C=UK/O=eScience/OU=Glasgow/L=Compserv/CN=douglas mcnab&lt;br /&gt;Creating temporary proxy ................................................................ Done&lt;br /&gt;Contacting  voms.ngs.ac.uk:15010 [/C=UK/O=eScience/OU=Manchester/L=MC/CN=voms.ngs.ac.uk/Email=support@grid-support.ac.uk] "ngs.ac.uk" Done&lt;br /&gt;&lt;br /&gt;Warning: voms.ngs.ac.uk:15010: The validity of this VOMS AC in your proxy is shortened to 86400 seconds!&lt;br /&gt;&lt;br /&gt;Creating proxy ............................................................................ Done&lt;br /&gt;Your proxy is valid until Thu Aug 20 15:34:41 2009&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;Then with a direct globus-job-run:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;-bash-3.00$ globus-job-run svr021.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs "/bin/hostname -f"&lt;br /&gt;GRAM Job submission failed because authentication with the remote server failed (error code 7)&lt;br /&gt;-bash-3.00$ globus-job-run svr026.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs "/bin/hostname -f"&lt;br /&gt;GRAM Job submission failed because data transfer to the server failed (error code 10)&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;After much investigation, the long and short of it is that even with the correct entries in the groupmapfile and grid-mapfile the issue still occurred.  So I checked the VO certificate in /etc/grid-security/vomsdir.  This was fine, although there was also the /etc/grid-security/vomsdir/ngs.ac.uk/voms.ngs.ac.uk.lsc which may have been getting used before the VO certificate.  
So to check I removed the /etc/grid-security/vomsdir/ngs.ac.uk/voms.ngs.ac.uk.lsc&lt;br /&gt;&lt;br /&gt;Hey presto, submission worked:&lt;br /&gt;&lt;code&gt;&lt;br /&gt;-bash-3.00$ globus-job-run svr026.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs "/bin/hostname -f"&lt;br /&gt;node295.beowulf.cluster&lt;br /&gt;-bash-3.00$ globus-job-run svr021.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs "/bin/hostname -f"&lt;br /&gt;node295.beowulf.cluster&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;So I think there may be an issue with ngs.ac.uk VO and the lsc file which looked correct.&lt;br /&gt;&lt;br /&gt;&lt;code&gt;&lt;br /&gt;svr026:/etc/grid-security/vomsdir/ngs.ac.uk# cat voms.ngs.ac.uk.lsc&lt;br /&gt;/C=UK/O=eScience/OU=Manchester/L=MC/CN=voms.ngs.ac.uk/Email=support@grid-support.ac.uk&lt;br /&gt;/C=UK/O=eScienceCA/OU=Authority/CN=CA&lt;br /&gt;&lt;/code&gt;&lt;br /&gt;&lt;br /&gt;This will be an issue in the future on SL5 when VO certificates are deprecated for the lsc file.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-859559678834687265?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/859559678834687265/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=859559678834687265' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/859559678834687265'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/859559678834687265'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/08/getting-ngsacuk-voms-to-work.html' title='getting ngs.ac.uk voms to work'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-1394348581410947491</id><published>2009-08-12T10:14:00.003+01:00</published><updated>2009-08-12T10:33:04.239+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SL5'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>the sl5 cluster grows</title><content type='html'>With a view to a full scale migration of Glasgow's worker nodes from sl4 to sl5 in September we have grown the size of our sl5 test cluster from 8 job slots to 112 job slots.&lt;br /&gt;&lt;br /&gt;This is accessible for submission to the following queues:&lt;br /&gt;&lt;br /&gt;dev010.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs-q30m&lt;br /&gt;dev010.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs-q6h&lt;br /&gt;dev010.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs-q1d&lt;br /&gt;dev010.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs-q2d&lt;br /&gt;dev010.gla.scotgrid.ac.uk:2119/jobmanager-lcgpbs-q3d&lt;br /&gt;&lt;br /&gt;Currently the CE is only advertising and accepting submissions from Atlas and queues are open to sgm/prd/pil account but I am more than welcome to open them to anyone who wishes to test.  Just drop me a line and I will create a test software area for any sgm account to install the application software via SL5 and allow access on the CE &amp; Batch Sys for running the jobs.&lt;br /&gt;&lt;br /&gt;So far things have been positive for Atlas with software kits now installing on SL5 and attempting to run kit validation.  
Currently we are failing KV tests; more precisely, it failed in the digitization phase, so we then failed the reconstruction.&lt;br /&gt;&lt;br /&gt;Nightly builds continue to be run so slowly but surely I'm sure these issues will be ironed out.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-1394348581410947491?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/1394348581410947491/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=1394348581410947491' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1394348581410947491'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/1394348581410947491'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/08/sl5-cluster-grows.html' title='the sl5 cluster grows'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-3135965207833774724</id><published>2009-07-28T16:51:00.006+01:00</published><updated>2009-08-12T10:32:50.212+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SL5'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>sl5 workers murmurings</title><content type='html'>Well we have had an sl5 test cluster for a while now but it has never really seen any action other than the odd random hello world job from testing with cream and the like. 
&lt;br /&gt;However, as the great sl5 debate raged on we put ourselves forward as a test site for atlas along with Oxford, another site with an sl5 cluster for installs of SL4 and future SL5 versions of the atlas software.&lt;br /&gt;&lt;br /&gt;However, in order to get my development CE visible to the real world I had to add it into our site bdii.  This then attracted ops and dteam jobs as by default they were allowed through the CE.  No great shakes and was actually good as it identified problems that had not been seen with simple hello world jobs from within ScotGrid.  &lt;br /&gt;&lt;br /&gt;The first mishap was a networking issue where the jobs could arrive but couldn't get their job wrapper and payload as most of our workers are NAT'd. Except my development one.  A simple fix once we worked out what was wrong.  &lt;br /&gt;&lt;br /&gt;Two other problems were encountered.  Firstly, the CE-sft-lcg-rm-free test went into a warn state as the glite-WN package no longer pulls in ldapsearch.  This is fixed by installing openldap-clients from sl-base.  &lt;br /&gt;&lt;br /&gt;Secondly, many of the jobs that actually did run through the system encountered an error on CE-sft-brokerinfo with something like: error while loading shared libraries: libclassad_ns.so.0: cannot open shared object file: No such file or directory&lt;br /&gt;After some googling, it turns out this bug is known about and has been fixed.  The fix is adding gridpath_prepend "LD_LIBRARY_PATH" "/opt/classads/lib64/" to /etc/profile.d/grid-env.sh.  However, at Glasgow we control grid-env.sh through cfengine so I needed to make the appropriate change there too.&lt;br /&gt;&lt;br /&gt;After going through this over the last few days I stumbled across &lt;a href="http://www.gridpp.ac.uk/wiki/SL5_Experience_May_2009"&gt;Ewan's page&lt;/a&gt; as he had encountered the exact same issues.  
So take heed and do a spot of googling first!&lt;br /&gt;&lt;br /&gt;There is also a metapackage available for SL5 glite3.2 WNs; this should hopefully contain all the required dependencies.  This is located &lt;a href="http://grid-deployment.web.cern.ch/grid-deployment/download/HEP/rpm/HEP_OSlibs_SL5-1.0.0-0.x86_64.rpm"&gt;here&lt;/a&gt;.  The gotcha with this is that you have to install it with yum localinstall or stick it in a yum repo as rpm -i doesn't work.&lt;br /&gt;&lt;br /&gt;I have also just compared what is installed from this against the &lt;a href="https://twiki.cern.ch/twiki/bin/view/Atlas/RPMCompatSLC5#SL5_issues"&gt;Atlas SL5 page&lt;/a&gt; and there were 4 packages missing: compat-gcc-34-g77, compat-libgcc-296, compat-libstdc++-296, ghostscript-8.15.2&lt;br /&gt;&lt;br /&gt;So currently we have ops/dteam jobs running and passing. Atlas software jobs running, completing but not successfully working.  More digging is required and I will keep you posted.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-3135965207833774724?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/3135965207833774724/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=3135965207833774724' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3135965207833774724'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/3135965207833774724'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/07/sl5-workers-mummerings.html' title='sl5 workers murmurings'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' 
width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-5451366428120643305</id><published>2009-07-15T16:17:00.002+01:00</published><updated>2009-07-15T16:37:07.886+01:00</updated><title type='text'>Rest in Peace gLite 3.0 ... finally</title><content type='html'>Today heralds a poignant day for the members of ScotGrid as we finally waved goodbye to the last gLite 3.0 Service (VOMS) and SL3.0 server in our cluster.  The sombre mood was only broken by the arrival of the newborn gLite 3.1 VOMS server running on SL4.  There was much flag waving and tears of joy as the first voms-proxy-init was issued and the shiny new web interface marvelled at.&lt;br /&gt;&lt;br /&gt;Again Jpackage caused a little confusion as tomcat5 pulls jdk6 unless you exclude it or force an install of jdk5.  This is preferred for all you firefox users out there.  As if tomcat is running under jdk6 you have to remember to turn off TLS1.0 from the preferences menu in order to get the SSL handshaking to work or you get a nice fat error page!  Not very useful for an admin screen let me tell you.&lt;br /&gt;&lt;br /&gt;This upgrade was tried last year but was hampered by a lack of database migration scripts.  This time around and with the help of these &lt;a href="https://twiki.cern.ch/twiki/bin/view/LCG/VOMS#Upgrading_from_a_gLite_3_0_VOMS"&gt;instructions&lt;/a&gt; it went swimmingly.&lt;br /&gt;&lt;br /&gt;So although it was a sad day for gLite 3.0 and SL3 camp and a small victory for gLite 3.1/SL4, the war is not over.  With gLite 3.2 and SL5 closing in on all fronts the battle is only just beginning.  &lt;br /&gt;&lt;br /&gt;p.s. 
we have an SL5 set-up so if you want to test, please let me know.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-5451366428120643305?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/5451366428120643305/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=5451366428120643305' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5451366428120643305'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/5451366428120643305'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/07/rest-in-peace-glite-30-finally.html' title='Rest in Peace gLite 3.0 ... finally'/><author><name>dug mcnab</name><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-32189452.post-2456405591366220240</id><published>2009-07-06T22:03:00.004+01:00</published><updated>2009-07-07T06:54:16.959+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='ATLAS'/><category scheme='http://www.blogger.com/atom/ns#' term='UKI-SCOTGRID-GLASGOW'/><title type='text'>Deflected Cosmic Rays...</title><content type='html'>&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_eLhn96sA3hw/SlJnqmrUYzI/AAAAAAAAAhI/ZweWo5P7XJk/s1600-h/drstats_CR_bycloud_all_TIER2S.gif"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer; width: 400px; height: 133px;" 
src="http://3.bp.blogspot.com/_eLhn96sA3hw/SlJnqmrUYzI/AAAAAAAAAhI/ZweWo5P7XJk/s400/drstats_CR_bycloud_all_TIER2S.gif" alt="" id="BLOGGER_PHOTO_ID_5355456888531346226" border="0" /&gt;&lt;/a&gt;&lt;br /&gt;This is the second short "when you're good..." post. During the RAL machine room move, we tested distributing ATLAS cosmics AOD and DPD data from CERN-&gt;GLASGOW-&gt;UK T2s. After some tweaking of the T2 FTS channels at CERN and tinkering in DDM, this has worked a charm. Data distribution in the UK has gone very well throughout the current combined cosmics data taking runs.&lt;br /&gt;&lt;br /&gt;This is the first time that we tried circumventing the T1 for such an organised data distribution and it was a real success for the UK, ATLAS and Glasgow.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/32189452-2456405591366220240?l=scotgrid.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://scotgrid.blogspot.com/feeds/2456405591366220240/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=32189452&amp;postID=2456405591366220240' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2456405591366220240'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/32189452/posts/default/2456405591366220240'/><link rel='alternate' type='text/html' href='http://scotgrid.blogspot.com/2009/07/deflected-cosmic-rays.html' title='Deflected Cosmic Rays...'/><author><name>Graeme Stewart</name><uri>http://www.blogger.com/profile/04113191724360870254</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='21' height='32' src='http://www.physics.gla.ac.uk/~graeme/graeme.jpg'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' 
url='http://3.bp.blogspot.com/_eLhn96sA3hw/SlJnqmrUYzI/AAAAAAAAAhI/ZweWo5P7XJk/s72-c/drstats_CR_bycloud_all_TIER2S.gif' height='72' width='72'/><thr:total>0</thr:total></entry></feed>
