Skip to content

Commit c5203b0

Browse files
haibchensunilgovind
authored andcommittedNov 9, 2018
YARN-8990. Fix fair scheduler race condition in app submit and queue cleanup. (Contributed by Wilfred Spiegelenburg)
(cherry picked from commit 524a752)
1 parent 68e514d commit c5203b0

File tree

3 files changed

+104
-42
lines changed

3 files changed

+104
-42
lines changed
 

‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java

+14
Original file line numberDiff line numberDiff line change
@@ -651,4 +651,18 @@ public void addAssignedApp(ApplicationId applicationId) {
651651
writeLock.unlock();
652652
}
653653
}
654+
655+
/**
656+
* This method is called when an application is removed from this queue
657+
* during the submit process.
658+
* @param applicationId the application's id
659+
*/
660+
public void removeAssignedApp(ApplicationId applicationId) {
661+
writeLock.lock();
662+
try {
663+
assignedApps.remove(applicationId);
664+
} finally {
665+
writeLock.unlock();
666+
}
667+
}
654668
}

‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java

+14-5
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ protected void addApplication(ApplicationId applicationId,
473473
writeLock.lock();
474474
try {
475475
RMApp rmApp = rmContext.getRMApps().get(applicationId);
476-
FSLeafQueue queue = assignToQueue(rmApp, queueName, user);
476+
FSLeafQueue queue = assignToQueue(rmApp, queueName, user, applicationId);
477477
if (queue == null) {
478478
return;
479479
}
@@ -499,6 +499,7 @@ protected void addApplication(ApplicationId applicationId,
499499
applicationId, queue.getName(),
500500
invalidAMResourceRequests, queue.getMaxShare());
501501
rejectApplicationWithMessage(applicationId, msg);
502+
queue.removeAssignedApp(applicationId);
502503
return;
503504
}
504505
}
@@ -513,14 +514,14 @@ protected void addApplication(ApplicationId applicationId,
513514
+ " cannot submit applications to queue " + queue.getName()
514515
+ "(requested queuename is " + queueName + ")";
515516
rejectApplicationWithMessage(applicationId, msg);
517+
queue.removeAssignedApp(applicationId);
516518
return;
517519
}
518520

519521
SchedulerApplication<FSAppAttempt> application =
520522
new SchedulerApplication<FSAppAttempt>(queue, user);
521523
applications.put(applicationId, application);
522524
queue.getMetrics().submitApp(user);
523-
queue.addAssignedApp(applicationId);
524525

525526
LOG.info("Accepted application " + applicationId + " from user: " + user
526527
+ ", in queue: " + queue.getName()
@@ -597,11 +598,19 @@ protected void addApplicationAttempt(
597598
}
598599

599600
/**
600-
* Helper method that attempts to assign the app to a queue. The method is
601-
* responsible to call the appropriate event-handler if the app is rejected.
601+
* Helper method for the tests to assign the app to a queue.
602602
*/
603603
@VisibleForTesting
604604
FSLeafQueue assignToQueue(RMApp rmApp, String queueName, String user) {
605+
return assignToQueue(rmApp, queueName, user, null);
606+
}
607+
608+
/**
609+
* Helper method that attempts to assign the app to a queue. The method is
610+
* responsible to call the appropriate event-handler if the app is rejected.
611+
*/
612+
private FSLeafQueue assignToQueue(RMApp rmApp, String queueName, String user,
613+
ApplicationId applicationId) {
605614
FSLeafQueue queue = null;
606615
String appRejectMsg = null;
607616

@@ -611,7 +620,7 @@ FSLeafQueue assignToQueue(RMApp rmApp, String queueName, String user) {
611620
if (queueName == null) {
612621
appRejectMsg = "Application rejected by queue placement policy";
613622
} else {
614-
queue = queueMgr.getLeafQueue(queueName, true);
623+
queue = queueMgr.getLeafQueue(queueName, true, applicationId);
615624
if (queue == null) {
616625
appRejectMsg = queueName + " is not a leaf queue";
617626
}

‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/QueueManager.java

+76-37
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.apache.hadoop.classification.InterfaceAudience.Private;
3939
import org.apache.hadoop.classification.InterfaceStability.Unstable;
4040
import org.apache.hadoop.conf.Configuration;
41+
import org.apache.hadoop.yarn.api.records.ApplicationId;
4142
import org.apache.hadoop.yarn.conf.YarnConfiguration;
4243
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.policies.FifoPolicy;
4344
import org.xml.sax.SAXException;
@@ -71,7 +72,7 @@ private void execute() {
7172
Boolean removed =
7273
removeEmptyIncompatibleQueues(queueToCreate, queueType).orElse(null);
7374
if (Boolean.TRUE.equals(removed)) {
74-
FSQueue queue = getQueue(queueToCreate, true, queueType, false);
75+
FSQueue queue = getQueue(queueToCreate, true, queueType, false, null);
7576
if (queue != null &&
7677
// if queueToCreate is present in the allocation config, set it
7778
// to static
@@ -124,30 +125,49 @@ public void initialize(Configuration conf) throws IOException,
124125

125126
/**
126127
* Get a leaf queue by name, creating it if the create param is
127-
* true and is necessary.
128-
* If the queue is not or can not be a leaf queue, i.e. it already exists as a
129-
* parent queue, or one of the parents in its name is already a leaf queue,
130-
* null is returned.
128+
* <code>true</code> and the queue does not exist.
129+
* If the queue is not or can not be a leaf queue, i.e. it already exists as
130+
* a parent queue, or one of the parents in its name is already a leaf queue,
131+
* <code>null</code> is returned.
131132
*
132133
* The root part of the name is optional, so a queue underneath the root
133134
* named "queue1" could be referred to as just "queue1", and a queue named
134135
* "queue2" underneath a parent named "parent1" that is underneath the root
135136
* could be referred to as just "parent1.queue2".
137+
* @param name name of the queue
138+
* @param create <code>true</code> if the queue must be created if it does
139+
* not exist, <code>false</code> otherwise
140+
* @return the leaf queue or <code>null</code> if the queue cannot be found
136141
*/
137142
public FSLeafQueue getLeafQueue(String name, boolean create) {
138-
return getLeafQueue(name, create, true);
143+
return getLeafQueue(name, create, null, true);
139144
}
140145

141-
private FSLeafQueue getLeafQueue(
142-
String name,
143-
boolean create,
144-
boolean recomputeSteadyShares) {
145-
FSQueue queue = getQueue(
146-
name,
147-
create,
148-
FSQueueType.LEAF,
149-
recomputeSteadyShares
150-
);
146+
/**
147+
* Get a leaf queue by name, creating it if the create param is
148+
* <code>true</code> and the queue does not exist.
149+
* If the queue is not or can not be a leaf queue, i.e. it already exists as
150+
* a parent queue, or one of the parents in its name is already a leaf queue,
151+
* <code>null</code> is returned.
152+
*
153+
* If the application will be assigned to the queue if the applicationId is
154+
* not <code>null</code>
155+
* @param name name of the queue
156+
* @param create <code>true</code> if the queue must be created if it does
157+
* not exist, <code>false</code> otherwise
158+
* @param applicationId the application ID to assign to the queue
159+
* @return the leaf queue or <code>null</code> if teh queue cannot be found
160+
*/
161+
public FSLeafQueue getLeafQueue(String name, boolean create,
162+
ApplicationId applicationId) {
163+
return getLeafQueue(name, create, applicationId, true);
164+
}
165+
166+
private FSLeafQueue getLeafQueue(String name, boolean create,
167+
ApplicationId applicationId,
168+
boolean recomputeSteadyShares) {
169+
FSQueue queue = getQueue(name, create, FSQueueType.LEAF,
170+
recomputeSteadyShares, applicationId);
151171
if (queue instanceof FSParentQueue) {
152172
return null;
153173
}
@@ -168,42 +188,55 @@ public boolean removeLeafQueue(String name) {
168188

169189
/**
170190
* Get a parent queue by name, creating it if the create param is
171-
* true and is necessary.
172-
* If the queue is not or can not be a parent queue,
173-
* i.e. it already exists as a
174-
* leaf queue, or one of the parents in its name is already a leaf queue,
175-
* null is returned.
191+
* <code>true</code> and the queue does not exist.
192+
* If the queue is not or can not be a parent queue, i.e. it already exists
193+
* as a leaf queue, or one of the parents in its name is already a leaf
194+
* queue, <code>null</code> is returned.
176195
*
177196
* The root part of the name is optional, so a queue underneath the root
178197
* named "queue1" could be referred to as just "queue1", and a queue named
179198
* "queue2" underneath a parent named "parent1" that is underneath the root
180199
* could be referred to as just "parent1.queue2".
200+
* @param name name of the queue
201+
* @param create <code>true</code> if the queue must be created if it does
202+
* not exist, <code>false</code> otherwise
203+
* @return the parent queue or <code>null</code> if the queue cannot be found
181204
*/
182205
public FSParentQueue getParentQueue(String name, boolean create) {
183206
return getParentQueue(name, create, true);
184207
}
185208

186-
public FSParentQueue getParentQueue(
187-
String name,
188-
boolean create,
209+
/**
210+
* Get a parent queue by name, creating it if the create param is
211+
* <code>true</code> and the queue does not exist.
212+
* If the queue is not or can not be a parent queue, i.e. it already exists
213+
* as a leaf queue, or one of the parents in its name is already a leaf
214+
* queue, <code>null</code> is returned.
215+
*
216+
* The root part of the name is optional, so a queue underneath the root
217+
* named "queue1" could be referred to as just "queue1", and a queue named
218+
* "queue2" underneath a parent named "parent1" that is underneath the root
219+
* could be referred to as just "parent1.queue2".
220+
* @param name name of the queue
221+
* @param create <code>true</code> if the queue must be created if it does
222+
* not exist, <code>false</code> otherwise
223+
* @param recomputeSteadyShares <code>true</code> if the steady fair share
224+
* should be recalculated when a queue is added,
225+
* <code>false</code> otherwise
226+
* @return the parent queue or <code>null</code> if the queue cannot be found
227+
*/
228+
public FSParentQueue getParentQueue(String name, boolean create,
189229
boolean recomputeSteadyShares) {
190-
FSQueue queue = getQueue(
191-
name,
192-
create,
193-
FSQueueType.PARENT,
194-
recomputeSteadyShares
195-
);
230+
FSQueue queue = getQueue(name, create, FSQueueType.PARENT,
231+
recomputeSteadyShares, null);
196232
if (queue instanceof FSLeafQueue) {
197233
return null;
198234
}
199235
return (FSParentQueue) queue;
200236
}
201237

202-
private FSQueue getQueue(
203-
String name,
204-
boolean create,
205-
FSQueueType queueType,
206-
boolean recomputeSteadyShares) {
238+
private FSQueue getQueue(String name, boolean create, FSQueueType queueType,
239+
boolean recomputeSteadyShares, ApplicationId applicationId) {
207240
boolean recompute = recomputeSteadyShares;
208241
name = ensureRootPrefix(name);
209242
FSQueue queue;
@@ -215,8 +248,14 @@ private FSQueue getQueue(
215248
} else {
216249
recompute = false;
217250
}
251+
// At this point the queue exists and we need to assign the app if to the
252+
// but only to a leaf queue
253+
if (applicationId != null && queue instanceof FSLeafQueue) {
254+
((FSLeafQueue)queue).addAssignedApp(applicationId);
255+
}
218256
}
219-
if (recompute) {
257+
// Don't recompute if it is an existing queue or no change was made
258+
if (recompute && queue != null) {
220259
rootQueue.recomputeSteadyShares();
221260
}
222261
return queue;
@@ -614,7 +653,7 @@ private void ensureQueueExistsAndIsCompatibleAndIsStatic(
614653
incompatibleQueuesPendingRemoval.add(
615654
new IncompatibleQueueRemovalTask(name, queueType));
616655
} else {
617-
FSQueue queue = getQueue(name, true, queueType, false);
656+
FSQueue queue = getQueue(name, true, queueType, false, null);
618657
if (queue != null) {
619658
queue.setDynamic(false);
620659
}

0 commit comments

Comments
 (0)
Please sign in to comment.