| /** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.airavata.cluster.monitoring; |
| |
| import com.jcraft.jsch.Channel; |
| import com.jcraft.jsch.ChannelExec; |
| import com.jcraft.jsch.JSch; |
| import com.jcraft.jsch.Session; |
| import org.apache.airavata.common.exception.ApplicationSettingsException; |
| import org.apache.airavata.common.utils.ServerSettings; |
| import org.apache.airavata.credential.store.cpi.CredentialStoreService; |
| import org.apache.airavata.model.credential.store.SSHCredential; |
| import org.apache.airavata.model.appcatalog.computeresource.ComputeResourceDescription; |
| import org.apache.airavata.model.appcatalog.computeresource.JobSubmissionInterface; |
| import org.apache.airavata.model.appcatalog.computeresource.JobSubmissionProtocol; |
| import org.apache.airavata.model.appcatalog.gatewayprofile.ComputeResourcePreference; |
| import org.apache.airavata.model.status.QueueStatusModel; |
| import org.apache.airavata.registry.api.RegistryService; |
| import org.apache.thrift.TException; |
| import org.apache.thrift.protocol.TBinaryProtocol; |
| import org.apache.thrift.protocol.TProtocol; |
| import org.apache.thrift.transport.TSocket; |
| import org.apache.thrift.transport.TTransport; |
| import org.apache.thrift.transport.TTransportException; |
| import org.quartz.Job; |
| import org.quartz.JobExecutionContext; |
| import org.quartz.JobExecutionException; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| public class ClusterStatusMonitorJob implements Job { |
| private final static Logger logger = LoggerFactory.getLogger(ClusterStatusMonitorJob.class); |
| |
| |
| @Override |
| public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException { |
| try{ |
| String superTenantGatewayId = ServerSettings.getSuperTenantGatewayId(); |
| RegistryService.Client registryClient = getRegistryClient(); |
| List<ComputeResourceProfile> computeResourceProfiles = new ArrayList<>(); |
| List<ComputeResourcePreference> computeResourcePreferences = null; |
| try{ |
| computeResourcePreferences = registryClient.getAllGatewayComputeResourcePreferences(superTenantGatewayId); |
| }catch (Exception ex){ |
| logger.warn("Could not find super tenant compute resources preferences for cluster status monitoring..."); |
| } |
| if (computeResourcePreferences != null && computeResourcePreferences.size() > 0) { |
| computeResourcePreferences.stream().forEach(p -> { |
| try { |
| String computeResourceId = p.getComputeResourceId(); |
| String credentialStoreToken = p.getResourceSpecificCredentialStoreToken(); |
| String loginUserName = p.getLoginUserName(); |
| String hostName = null; |
| if (credentialStoreToken == null || credentialStoreToken.equals("")) { |
| credentialStoreToken = registryClient.getGatewayResourceProfile(superTenantGatewayId).getCredentialStoreToken(); |
| } |
| int port = -1; |
| ArrayList queueNames = new ArrayList<>(); |
| |
| ComputeResourceDescription computeResourceDescription = registryClient.getComputeResource(computeResourceId); |
| hostName = computeResourceDescription.getHostName(); |
| //FIXME This should come from compute resource description |
| port = 22; |
| computeResourceDescription.getBatchQueues().stream().forEach(q -> { |
| queueNames.add(q.getQueueName()); |
| }); |
| |
| List<JobSubmissionInterface> jobSubmissionInterfaces = computeResourceDescription.getJobSubmissionInterfaces(); |
| if (jobSubmissionInterfaces != null && jobSubmissionInterfaces.size() > 0) { |
| if (jobSubmissionInterfaces.get(0).getJobSubmissionProtocol().equals(JobSubmissionProtocol.SSH)) { |
| String resourceManagerType = registryClient.getSSHJobSubmission(jobSubmissionInterfaces.get(0) |
| .getJobSubmissionInterfaceId()).getResourceJobManager().getResourceJobManagerType().name(); |
| ComputeResourceProfile computeResourceProfile = new ComputeResourceProfile(hostName, |
| loginUserName, port, credentialStoreToken, queueNames, resourceManagerType); |
| computeResourceProfiles.add(computeResourceProfile); |
| } |
| } |
| } catch (TException e) { |
| logger.error(e.getMessage()); |
| } |
| }); |
| } |
| |
| ArrayList<QueueStatusModel> queueStatuses = new ArrayList<>(); |
| |
| for (ComputeResourceProfile computeResourceProfile : computeResourceProfiles) { |
| |
| String userName = computeResourceProfile.getUserName(); |
| String hostName = computeResourceProfile.getHostName(); |
| int port = computeResourceProfile.getPort(); |
| |
| try { |
| JSch jsch = new JSch(); |
| CredentialStoreService.Client credentialClient = getCredentialStoreClient(); |
| SSHCredential sshCredential = credentialClient.getSSHCredential(computeResourceProfile.getCredentialStoreToken(), superTenantGatewayId); |
| jsch.addIdentity(hostName, sshCredential.getPrivateKey().getBytes(), sshCredential.getPublicKey().getBytes(), sshCredential.getPassphrase().getBytes()); |
| |
| Session session = jsch.getSession(userName, hostName, port); |
| java.util.Properties config = new java.util.Properties(); |
| config.put("StrictHostKeyChecking", "no"); |
| session.setConfig(config); |
| |
| logger.debug("Connected to " + hostName); |
| |
| session.connect(); |
| for (String queue : computeResourceProfile.getQueueNames()) { |
| String command = ""; |
| if (computeResourceProfile.getResourceManagerType().equals("SLURM")) |
| command = "sinfo -s -p " + queue + " -o \"%a %F\" | tail -1"; |
| else if (computeResourceProfile.getResourceManagerType().equals("PBS")) |
| command = "qstat -Q " + queue + "| tail -1"; |
| |
| if (command.equals("")) { |
| logger.warn("No matching resource manager type found for " + computeResourceProfile.getResourceManagerType()); |
| continue; |
| } |
| |
| Channel channel = session.openChannel("exec"); |
| ((ChannelExec) channel).setCommand(command); |
| channel.setInputStream(null); |
| ((ChannelExec) channel).setErrStream(System.err); |
| InputStream in = channel.getInputStream(); |
| channel.connect(); |
| byte[] tmp = new byte[1024]; |
| String result = ""; |
| while (true) { |
| while (in.available() > 0) { |
| int i = in.read(tmp, 0, 1024); |
| if (i < 0) break; |
| result += new String(tmp, 0, i); |
| } |
| if (channel.isClosed()) { |
| if (in.available() > 0) continue; |
| logger.debug(hostName + " " + queue + " " + "exit-status: " + channel.getExitStatus()); |
| break; |
| } |
| try { |
| Thread.sleep(1000); |
| } catch (Exception ee) { |
| } |
| } |
| channel.disconnect(); |
| |
| if (result != null && result.length() > 0) { |
| QueueStatusModel queueStatus = null; |
| if (computeResourceProfile.getResourceManagerType().equals("SLURM")) { |
| String[] sparts = result.split(" "); |
| boolean isUp = sparts[0].equalsIgnoreCase("up"); |
| String knts = sparts[1]; |
| sparts = knts.split("/"); |
| int running = Integer.parseInt(sparts[0].trim()); |
| int queued = Integer.parseInt(sparts[1].trim()); |
| queueStatus = new QueueStatusModel(hostName, queue, isUp, running, queued, System.currentTimeMillis()); |
| |
| } else if (computeResourceProfile.getResourceManagerType().equals("PBS")) { |
| result = result.replaceAll("\\s+", " "); |
| String[] sparts = result.split(" "); |
| boolean isUp = sparts[3].equalsIgnoreCase("yes"); |
| int running = Integer.parseInt(sparts[6].trim()); |
| int queued = Integer.parseInt(sparts[5].trim()); |
| queueStatus = new QueueStatusModel(hostName, queue, isUp, running, queued, System.currentTimeMillis()); |
| } |
| |
| if (queueStatus != null) |
| queueStatuses.add(queueStatus); |
| } |
| } |
| session.disconnect(); |
| } catch (Exception ex) { |
| logger.error("Failed to get cluster status from " + computeResourceProfile.getHostName()); |
| logger.error(ex.getMessage(), ex); |
| } |
| } |
| if(queueStatuses != null && queueStatuses.size() > 0){ |
| registryClient.registerQueueStatuses(queueStatuses); |
| } |
| }catch (Exception e){ |
| throw new JobExecutionException(e); |
| } |
| |
| } |
| private static RegistryService.Client getRegistryClient() throws TTransportException, ApplicationSettingsException { |
| TTransport transport = new TSocket(ServerSettings.getRegistryServerHost(), Integer.parseInt(ServerSettings.getRegistryServerPort())); |
| transport.open(); |
| TProtocol protocol = new TBinaryProtocol(transport); |
| RegistryService.Client registryClient = new RegistryService.Client(protocol); |
| return registryClient; |
| } |
| |
| private static CredentialStoreService.Client getCredentialStoreClient() throws TTransportException, ApplicationSettingsException { |
| TTransport transport = new TSocket(ServerSettings.getCredentialStoreServerHost(), Integer.parseInt(ServerSettings.getCredentialStoreServerPort())); |
| transport.open(); |
| TProtocol protocol = new TBinaryProtocol(transport); |
| CredentialStoreService.Client credentialServiceClient = new CredentialStoreService.Client(protocol); |
| return credentialServiceClient; |
| } |
| |
| private static class ComputeResourceProfile { |
| |
| private String hostName; |
| private String userName; |
| private int port; |
| private String credentialStoreToken; |
| private List<String> queueNames; |
| private String resourceManagerType; |
| |
| public ComputeResourceProfile(String hostName, String userName, int port, String credentialStoreToken, List<String> queueNames, String resourceManagerType) { |
| this.hostName = hostName; |
| this.userName = userName; |
| this.port = port; |
| this.credentialStoreToken = credentialStoreToken; |
| this.queueNames = queueNames; |
| this.resourceManagerType = resourceManagerType; |
| } |
| |
| public String getHostName() { |
| return hostName; |
| } |
| |
| public void setHostName(String hostName) { |
| this.hostName = hostName; |
| } |
| |
| public String getUserName() { |
| return userName; |
| } |
| |
| public void setUserName(String userName) { |
| this.userName = userName; |
| } |
| |
| public int getPort() { |
| return port; |
| } |
| |
| public void setPort(int port) { |
| this.port = port; |
| } |
| |
| public String getCredentialStoreToken() { |
| return credentialStoreToken; |
| } |
| |
| public void setCredentialStoreToken(String credentialStoreToken) { |
| this.credentialStoreToken = credentialStoreToken; |
| } |
| |
| public List<String> getQueueNames() { |
| return queueNames; |
| } |
| |
| public void setQueueNames(List<String> queueNames) { |
| this.queueNames = queueNames; |
| } |
| |
| public String getResourceManagerType() { |
| return resourceManagerType; |
| } |
| |
| public void setResourceManagerType(String resourceManagerType) { |
| this.resourceManagerType = resourceManagerType; |
| } |
| } |
| } |