service經常restart,找不到原因。
RHEL AS3裡面,兩個節點組成cluster,節點二上的service一直以來都無端restart,不知道什麼原因,syslog裡面的記錄:
Jun 16 04:12:31 conn02 clusvcmgrd: <crit> Couldn't connect to member #0: Broken pipe
Jun 16 04:12:31 conn02 clusvcmgrd: <err> Unable to obtain cluster lock: No locks available
Jun 16 04:12:31 conn02 clusvcmgrd: <warning> Restarting locally failed service app
Jun 16 04:12:37 conn02 clusvcmgrd: <crit> Couldn't connect to member #0: Broken pipe
Jun 16 04:12:37 conn02 clusvcmgrd: <err> Unable to obtain cluster lock: No locks available
Jun 16 06:21:36 conn02 clusvcmgrd: <err> Unable to obtain cluster lock: Broken pipe
Jun 16 06:21:36 conn02 clusvcmgrd: <warning> Restarting locally failed service app
Jun 16 06:21:36 conn02 clusvcmgrd: : <notice> service notice: Stopping service app ...
Jun 16 06:21:36 conn02 clusvcmgrd: : <notice> service notice: Running user script '/scripts/app.sh stop'
Jun 16 06:22:09 conn02 clusvcmgrd: : <notice> service notice: Stopped service app ...
Jun 16 06:22:09 conn02 clusvcmgrd: <notice> Starting stopped service app
Jun 16 06:22:09 conn02 clusvcmgrd: : <notice> service notice: Starting service app ...
Jun 16 06:22:10 conn02 clusvcmgrd: : <notice> service notice: Running user script '/scripts/app.sh start'
Jun 16 06:22:10 conn02 clusvcmgrd: : <notice> service notice: Started service app ...
cluster的日誌也記錄了service的狀態,通常都是把service stop,很快之後又把service startup。
節點一不會有這樣的情況發生。
cluster.xml見下。
<?xml version="1.0"?>
<cluconfig version="3.0">
<clumembd broadcast="no" interval="1000000" loglevel="5" multicast="yes" multicast_ipaddress="225.0.0.11" thread="yes" tko_count="20"/>
<cluquorumd loglevel="5" pinginterval="2" tiebreaker_ip=""/>
<clurmtabd loglevel="5" pollinterval="4"/>
<clusvcmgrd loglevel="5"/>
<clulockd loglevel="5"/>
<cluster config_viewnumber="74" key="152564bfc7a0b536a6a6d7ba0c5d00bc" name="foxsz"/>
<sharedstate driver="libsharedraw.so" rawprimary="/dev/raw/raw1" rawshadow="/dev/raw/raw2" type="raw"/>
<members>
<member id="0" name="conn02" watchdog="yes">
</member>
<member id="1" name="conn01" watchdog="yes">
</member>
</members>
<services>
<service checkinterval="10" failoverdomain="db" id="0" name="db" userscript="/scripts/db.sh">
<service_ipaddresses>
<service_ipaddress broadcast="10.207.195.47" id="0" ipaddress="10.207.195.37" netmask="255.255.255.240"/>
</service_ipaddresses>
<device id="0" name="/dev/oravg/lvol1" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/product" options="noatime"/>
</device>
<device id="1" name="/dev/oravg/lvol2" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/index2" options="noatime"/>
</device>
<device id="2" name="/dev/oravg/lvol3" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/index1" options="noatime"/>
</device>
<device id="3" name="/dev/oravg/lvol4" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/backup" options="noatime"/>
</device>
<device id="4" name="/dev/oravg/lvol5" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/data2" options="noatime"/>
</device>
<device id="5" name="/dev/oravg/lvol6" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/data1" options="noatime"/>
</device>
<device id="6" name="/dev/oravg/lvol7" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/coredbf" options="noatime"/>
</device>
<device id="7" name="/dev/oravg/lvol8" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/archive" options="noatime"/>
</device>
<device id="8" name="/dev/oravg2/lvol1" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/redo_log1" options="noatime"/>
</device>
<device id="9" name="/dev/oravg2/lvol2" sharename="">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/oracle/redo_log2" options="noatime"/>
</device>
</service>
<service checkinterval="10" failoverdomain="app" id="1" name="app" userscript="/scripts/app.sh">
<service_ipaddresses>
<service_ipaddress broadcast="10.207.195.47" id="0" ipaddress="10.207.195.40" netmask="255.255.255.240"/>
</service_ipaddresses>
<device id="0" name="/dev/appvg/lvol1" sharename="users">
<mount forceunmount="yes" fstype="ext3" mountpoint="/users" options="noatime"/>
<nfsexport id="0" name="/users">
<client id="0" name="*" options="rw"/>
</nfsexport>
</device>
<device id="1" name="/dev/appvg/lvol2" sharename="users1">
<mount forceunmount="yes" fstype="ext3" mountpoint="/app/appserv" options="noatime"/>
</device>
</service>
</services>
<failoverdomains>
<failoverdomain id="0" name="db" ordered="yes" restricted="yes">
<failoverdomainnode id="0" name="conn01"/>
<failoverdomainnode id="1" name="conn02"/>
</failoverdomain>
<failoverdomain id="1" name="app" ordered="yes" restricted="yes">
<failoverdomainnode id="0" name="conn02"/>
<failoverdomainnode id="1" name="conn01"/>
</failoverdomain>
</failoverdomains>
</cluconfig>
這個問題會不會跟網路有關係,這套系統的心跳和數據網線都連接在同一個switch上。我也不知道這種情況該怎麼定義,我覺得不是failover,但為什麼cluster會把service stop掉呢?
《解決方案》
水晶頭的問題吧
我單位以前也這樣
《解決方案》
回復 #2 00306 的帖子
配置文件是沒有多少出入的,就只有這個需要檢測了